In [1]:
import json
import os
import time
from typing import Dict, List, Optional, Tuple

import pandas as pd
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Populate the process environment from the .env file addressed by the
# relative path "../.env" (resolved against the current working directory).
load_dotenv(dotenv_path="../.env")

# Hugging Face API token; this is None when the variable is not set.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [3]:
# System turn: assistant role, the {context} slot, and the grounding rules.
_system_text = "".join([
    "You are a helpful travel assistant with expertise in hotels, destinations, and travel recommendations.\n\n",
    "Context:\n{context}\n\n",
    "Instructions:\n",
    "- Answer based ONLY on the provided context\n",
    "- Be specific and cite information from the context\n",
    "- If information is missing, clearly state what you don't know\n",
    "- Provide helpful, natural responses",
])

# Human turn: the {question} slot followed by a reminder to stay on-context.
_human_text = "".join([
    "User Question: {question}\n\n",
    "Please provide a clear, accurate answer based on the context above.",
])

# Two-message chat prompt; expects the input variables "context" and "question".
prompt_template = ChatPromptTemplate.from_messages([
    ("system", _system_text),
    ("human", _human_text),
])

In [14]:
def setup_llm_models(model_repos: Optional[Dict[str, str]] = None,
                     api_token: Optional[str] = None) -> Dict:
    """
    Build one chat model per Hugging Face repository.

    Each repository id is wrapped in a ``HuggingFaceEndpoint`` and then in a
    ``ChatHuggingFace`` chat interface.

    Args:
        model_repos: Mapping of display name -> Hugging Face ``repo_id``.
            When omitted, the notebook's default trio (Mistral, Llama3,
            Gemma) is used. An empty mapping yields an empty result.
        api_token: Token passed to every endpoint. When omitted, the
            module-level ``hf_token`` is used (which may itself be ``None``).

    Returns:
        Dict mapping each display name to its ``ChatHuggingFace`` instance,
        in the iteration order of ``model_repos``.
    """
    # `is None` (not truthiness) so callers can deliberately pass an empty dict.
    if model_repos is None:
        model_repos = {
            "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
            "Llama3": "meta-llama/Meta-Llama-3-8B-Instruct",
            "Gemma": "google/gemma-2-2b-it",
        }

    # The global token is only looked up when no explicit token was supplied.
    token = hf_token if api_token is None else api_token

    models = {}
    for name, repo_id in model_repos.items():
        endpoint = HuggingFaceEndpoint(
            repo_id=repo_id,
            huggingfacehub_api_token=token,
        )
        models[name] = ChatHuggingFace(llm=endpoint)

    return models

In [15]:
# Context shared by the first three cases (two Paris hotels, one Berlin hotel).
_CITY_RATINGS_CONTEXT = (
    "Hotel Le Grand is located in Paris with a rating of 4.5. "
    "Hotel Petit is also in Paris with rating 4.2. "
    "Hotel Berlin is in Berlin with rating 4.7."
)

# Each case carries the question, the context handed to the model, a reference
# answer, and human-readable criteria for judging the response.
test_cases = [
    dict(
        question="Which hotels are in Paris?",
        context=_CITY_RATINGS_CONTEXT,
        expected_answer="Two hotels: Hotel Le Grand (4.5 rating) and Hotel Petit (4.2 rating)",
        evaluation_criteria=["mentions both hotels", "includes ratings", "doesn't mention Berlin hotel"],
    ),
    dict(
        question="What is the highest rated hotel?",
        context=_CITY_RATINGS_CONTEXT,
        expected_answer="Hotel Berlin with 4.7 rating",
        evaluation_criteria=["identifies Hotel Berlin", "mentions 4.7 rating", "correctly identifies highest"],
    ),
    dict(
        question="Are there hotels in Rome?",
        context=_CITY_RATINGS_CONTEXT,
        expected_answer="No hotels in Rome in the provided data",
        evaluation_criteria=["says no/none", "doesn't hallucinate hotels", "acknowledges limitation"],
    ),
    dict(
        question="What amenities does Hotel Le Grand offer?",
        context=(
            "Hotel Le Grand is located in Paris with a rating of 4.5. "
            "Hotel Le Grand offers WiFi, Pool, and Spa. "
            "Price is $200 per night."
        ),
        expected_answer="WiFi, Pool, and Spa",
        evaluation_criteria=["lists all three amenities", "doesn't add fake amenities", "clear and concise"],
    ),
    dict(
        question="Which hotel is most affordable?",
        context=(
            "Hotel Le Grand costs $200/night with 4.5 rating. "
            "Hotel Petit costs $150/night with 4.2 rating. "
            "Hotel Berlin costs $180/night with 4.7 rating."
        ),
        expected_answer="Hotel Petit at $150/night",
        evaluation_criteria=["identifies Hotel Petit", "mentions correct price", "clearly states it's cheapest"],
    ),
]

In [12]:
class LLMEvaluator:
    """Run chat models over question/context test cases and compare them.

    Attributes:
        models: Mapping of display name -> model object. Each model is piped
            after the prompt template (``prompt_template | model``) and the
            resulting chain is called via ``.invoke(...)``.
        prompt_template: Prompt piped in front of every model; it is invoked
            with the keys ``"context"`` and ``"question"``.
        results: One record (dict) per model call, appended in call order by
            ``evaluate_all_models``.
        test_cases: The test cases evaluated so far, in evaluation order.
            Kept on the instance so reporting does not depend on a notebook
            global.
    """

    # Labels of the summary table, in the order the aggregations are flattened.
    _METRIC_COLUMNS = ['Avg Time (s)', 'Std Time', 'Min Time', 'Max Time',
                       'Success Rate', 'Avg Tokens']

    def __init__(self, models: Dict, prompt_template: ChatPromptTemplate):
        self.models = models
        self.prompt_template = prompt_template
        self.results = []
        self.test_cases = []

    def run_single_test(self, model_name: str, model, test_case: Dict) -> Dict:
        """Run a single test case for one model.

        Args:
            model_name: Label stored in the result record.
            model: Object that can be piped after the prompt template.
            test_case: Dict providing at least ``"context"`` and ``"question"``.

        Returns:
            A record with the keys ``model``, ``question``, ``response``,
            ``time`` (seconds), ``success``, ``error`` and ``tokens_estimate``.
            Exceptions raised while invoking the chain are captured in the
            record (``success=False``, ``error=str(exc)``) instead of
            propagating, so one failing model does not abort the whole run.
        """
        chain = self.prompt_template | model

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during the call.
        started = time.perf_counter()
        try:
            response = chain.invoke({
                "context": test_case["context"],
                "question": test_case["question"]
            })
            elapsed_time = time.perf_counter() - started
            content = response.content

            return {
                "model": model_name,
                "question": test_case["question"],
                "response": content,
                "time": elapsed_time,
                "success": True,
                "error": None,
                # Whitespace-separated word count: a rough proxy, not real tokens.
                "tokens_estimate": len(content.split())
            }
        except Exception as e:
            elapsed_time = time.perf_counter() - started
            return {
                "model": model_name,
                "question": test_case["question"],
                "response": None,
                "time": elapsed_time,
                "success": False,
                "error": str(e),
                "tokens_estimate": 0
            }

    def test_models(self, test_cases: List[Dict], models: Optional[Dict] = None,
                    prompt_template: Optional[ChatPromptTemplate] = None):
        """Quick function to test models with custom input.

        Prints every model's raw response for every test case. Nothing is
        recorded in ``self.results`` and errors are NOT caught here.

        Args:
            test_cases: Dicts providing ``"context"`` and ``"question"``.
            models: Name -> model mapping; defaults to ``self.models``.
            prompt_template: Prompt to pipe in front of each model; defaults
                to ``self.prompt_template``.
        """
        if models is None:
            models = self.models
        if prompt_template is None:
            prompt_template = self.prompt_template

        # The chain for a model does not depend on the test case: build it once.
        chains = {name: prompt_template | model for name, model in models.items()}

        for test_case in test_cases:
            payload = {"context": test_case["context"], "question": test_case["question"]}
            for name, chain in chains.items():
                response = chain.invoke(payload)
                print(f"\n{name}:")
                print(response.content)
                print("-" * 80)

    def evaluate_all_models(self, test_cases: List[Dict]):
        """Run all test cases for all models.

        Result records are appended to ``self.results`` and the evaluated
        cases are appended to ``self.test_cases`` (both accumulate across
        calls). A progress line is printed per model call.

        Returns:
            ``self.results`` (the accumulated list, not a copy).
        """
        print("Starting LLM Evaluation...\n")

        cases = list(test_cases)
        # Remember what was evaluated so print_qualitative_comparison can
        # resolve an index without reaching for a global variable.
        self.test_cases.extend(cases)

        for i, test_case in enumerate(cases, 1):
            print(f"Test Case {i}/{len(cases)}: {test_case['question']}")

            for model_name, model in self.models.items():
                # flush so the label is visible while the model call blocks
                print(f"  Testing {model_name}...", end=" ", flush=True)
                result = self.run_single_test(model_name, model, test_case)
                self.results.append(result)

                if result["success"]:
                    print(f"✓ ({result['time']:.2f}s)")
                else:
                    print(f"✗ Error: {result['error'][:50]}")

            print()

        return self.results

    def get_quantitative_metrics(self) -> pd.DataFrame:
        """Calculate quantitative metrics.

        Returns:
            A DataFrame indexed by model name with latency statistics
            (mean/std/min/max of ``time``), the mean of ``success`` and the
            mean of ``tokens_estimate``, rounded to 3 decimals. When no
            results have been recorded yet, an empty frame with the same
            columns is returned.
        """
        if not self.results:
            # Without any record there is no 'model' column to group by.
            return pd.DataFrame(columns=self._METRIC_COLUMNS,
                                index=pd.Index([], name='model'))

        df = pd.DataFrame(self.results)

        metrics = df.groupby('model').agg({
            'time': ['mean', 'std', 'min', 'max'],
            'success': 'mean',
            'tokens_estimate': 'mean'
        }).round(3)

        # Flatten the (column, aggregation) MultiIndex into readable labels.
        metrics.columns = self._METRIC_COLUMNS

        return metrics

    def print_qualitative_comparison(self, test_case_idx: int = 0,
                                     test_cases: Optional[List[Dict]] = None):
        """Print side-by-side qualitative comparison.

        Args:
            test_case_idx: Position of the case within the case list.
            test_cases: Case list to index into; defaults to the cases
                recorded by ``evaluate_all_models``.

        Raises:
            ValueError: If no case list is available.
            IndexError: If ``test_case_idx`` is out of range.
        """
        cases = self.test_cases if test_cases is None else test_cases
        if not cases:
            raise ValueError(
                "No test cases available: call evaluate_all_models() first "
                "or pass test_cases explicitly."
            )

        case = cases[test_case_idx]
        # Records are matched to the case by its question text.
        test_results = [r for r in self.results if r['question'] == case['question']]

        print(f"\n{'='*80}")
        print(f"QUALITATIVE COMPARISON - Test Case {test_case_idx + 1}")
        print(f"Question: {case['question']}")
        print(f"Context: {case['context'][:100]}...")
        print(f"Expected: {case['expected_answer']}")
        print(f"{'='*80}\n")

        for result in test_results:
            print(f"--- {result['model']} ({result['time']:.2f}s) ---")
            if result['success']:
                print(result['response'])
            else:
                print(f"ERROR: {result['error']}")
            print()

    def export_results(self, filename: str = "llm_comparison_results.json"):
        """Export detailed results to JSON.

        Writes ``self.results`` to ``filename`` (overwriting it) and prints a
        confirmation line.
        """
        # Explicit encoding keeps the file identical across platforms.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)
        print(f"Results exported to {filename}")

In [16]:
# Build every configured chat model; all of them take part in this run.
all_models = setup_llm_models()
selected_models = all_models

print(f"Selected models: {list(selected_models)}\n")

# Wire the models and the shared prompt into an evaluator, then run every
# test case against every model.
evaluator = LLMEvaluator(selected_models, prompt_template)
results = evaluator.evaluate_all_models(test_cases)

# Per-model summary table (latency, success rate, response length).
banner = "=" * 80
print(f"\n{banner}")
print("QUANTITATIVE METRICS")
print(banner)
print(evaluator.get_quantitative_metrics())

# Raw responses for the first test case. Pass another index (1, 2, ...) to
# inspect the remaining cases.
evaluator.print_qualitative_comparison(0)

# Persist the per-call result records as JSON.
evaluator.export_results()

Selected models: ['Mistral', 'Llama3', 'Gemma']

Starting LLM Evaluation...

Test Case 1/5: Which hotels are in Paris?
  Testing Mistral... ✓ (2.47s)
  Testing Llama3... ✓ (1.30s)
  Testing Gemma... ✓ (0.82s)

Test Case 2/5: What is the highest rated hotel?
  Testing Mistral... ✓ (1.95s)
  Testing Llama3... ✓ (0.97s)
  Testing Gemma... ✓ (0.62s)

Test Case 3/5: Are there hotels in Rome?
  Testing Mistral... ✓ (2.29s)
  Testing Llama3... ✓ (1.35s)
  Testing Gemma... ✓ (0.67s)

Test Case 4/5: What amenities does Hotel Le Grand offer?
  Testing Mistral... ✓ (1.83s)
  Testing Llama3... ✓ (1.14s)
  Testing Gemma... ✓ (0.68s)

Test Case 5/5: Which hotel is most affordable?
  Testing Mistral... ✓ (1.95s)
  Testing Llama3... ✓ (1.31s)
  Testing Gemma... ✓ (0.62s)


QUANTITATIVE METRICS
         Avg Time (s)  Std Time  Min Time  Max Time  Success Rate  Avg Tokens
model                                                                        
Gemma           0.682     0.084     0.620     0.823    