### Load Data
Loading documents and ground truth data for evaluation

In [1]:
# Import required libraries
import json
import pandas as pd
import minsearch
from tqdm.auto import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load documents from processed JSON file
with open('../data/processed/documents-with-ids.json', 'r') as f:
    documents = json.load(f)

# Load ground truth dataset for evaluation from CSV file
df_ground_truth = pd.read_csv('../data/processed/ground-truth-retrieval.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

print(f"Loaded {len(documents)} documents and {len(ground_truth)} ground truth questions")

Loaded 149 documents and 735 ground truth questions


In [3]:
documents[10]

{'location': 'Andhra_Pradesh',
 'doc_id': 'd4402d82c0',
 'content': "Andhra Pradesh is known for its rich history, architecture and culture. Andhra Pradesh has a variety of tourist attractions including beaches, hills, wildlife, forests and temples. Like rest of the Southern India, the culture of Andhra Pradesh is essentially Dravidian, quite different from North India's Sanskrit Hindu culture. Andhra Pradesh was part of the British Madras presidency and then independent India's Madras State until 1953, when Andhra State was formed, with the capital being",
 'id': '450a9d36'}

In [4]:
ground_truth[10]

{'question': 'What are the must-see religious temples to visit in Andhra Pradesh?',
 'id': 'db0beb52'}

###  Joining ground-truth-retrieval.csv with documents-with-ids.json

In [5]:
# Index the documents by 'id' so each ground-truth row can be matched directly
documents_dict = {doc['id']: doc for doc in documents}

# Collect one joined record per ground-truth question that has a matching document
complete_pairs = []

for gt in ground_truth:
    gt_id = gt['id']
    doc_record = documents_dict.get(gt_id)

    if doc_record is None:
        # No document carries this id, so the question cannot be paired
        continue

    # Core fields first, in a fixed order
    joined_record = {
        'id': gt_id,
        'question': gt['question'],
        'location': doc_record['location'],
        'doc_id': doc_record['doc_id'],
        'content': doc_record['content'],
    }

    # Carry over any extra ground-truth columns
    joined_record.update(
        (key, value) for key, value in gt.items()
        if key not in ['id', 'question']
    )

    # Carry over any extra document fields
    joined_record.update(
        (key, value) for key, value in doc_record.items()
        if key not in ['id', 'location', 'doc_id', 'content']
    )

    complete_pairs.append(joined_record)

print(f"\nComplete question-document pairs: {len(complete_pairs)}")
print(f"Ground truth questions without matching documents: {len(ground_truth) - len(complete_pairs)}")


Complete question-document pairs: 735
Ground truth questions without matching documents: 0


In [6]:
# Show sample of the final data
print(f"\nSample of complete question-document pairs:")
for i, record in enumerate(complete_pairs[:2]):
    print(f"Pair {i+1}:")
    print(json.dumps(record, indent=2, ensure_ascii=False))
    print("-" * 50)

print(f"\nRecommendation: Use 'question_document_pairs.json' for your retrieval evaluation tasks.")
print(f"This file contains only complete question-document pairs that can be used for training/evaluation.")


Sample of complete question-document pairs:
Pair 1:
{
  "id": "4f80b327",
  "question": "What are the must-see religious sites in Andhra Pradesh for pilgrims?",
  "location": "Andhra_Pradesh",
  "doc_id": "d4402d82c0",
  "content": "Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)"
}
--------------------------------------------------
Pair 2:
{
  "id": "4f80b327",
  "question": "Which natural attractions and caves can tourists explore in Andhra Pradesh?",
  "location": "Andhra_Pradesh",
  "doc_id": "d4402d82c0",
  "content": "Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)"
}
--------------------------------------------------

Recommendation: Use 'question_document_pairs.json' for your retrieval evaluation tasks.
This file contains only complete question-document pairs that can be used for training/evaluation.


In [7]:
# Save the single recommended file
with open('../data/processed/question_document_pairs.json', 'w') as f:
    json.dump(complete_pairs, f, indent=2, ensure_ascii=False)

print(f"\nSaved complete pairs to: '../data/processed/question_document_pairs.json'")


Saved complete pairs to: '../data/processed/question_document_pairs.json'


In [8]:
complete_pairs[0]

{'id': '4f80b327',
 'question': 'What are the must-see religious sites in Andhra Pradesh for pilgrims?',
 'location': 'Andhra_Pradesh',
 'doc_id': 'd4402d82c0',
 'content': 'Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)'}

In [9]:
doc_idx = {d['id']: d for d in complete_pairs}
doc_idx['db0beb52']['content']

'Northern Coast (Alluri Sitharama Raju, Anakapalli, East Godavari, Kakinada, Konaseema, Parvathipuram Manyam, Srikakulam, Visakhapatnam, Vizianagaram, Yanam) Central Coast (Eluru, Krishna, NTR, West Godavari) Southern Coast (Bapatla, Guntur, Nellore, Palnadu, Prakasam, Tirupati) Rayalaseema (Annamayya, Anantapur, Chittoor, Kadapa, Kurnool, Nandyal, Sri Sathya Sai)  \n![0_image_1.png](0_image_1.png) interactive map'

## Index data

We'll generate embeddings using the [sentence-transformers](https://sbert.net/) library. If you don't have it, install it with pip:

```bash
pip install sentence-transformers
```

This is a different way of turning sentences into vectors: each text is encoded by a pretrained neural model into a dense embedding.

In [10]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [11]:
# Load your joined question-document pairs
with open('../data/processed/question_document_pairs.json', 'r') as f:
    documents = json.load(f)

In [12]:
documents[0]

{'id': '4f80b327',
 'question': 'What are the must-see religious sites in Andhra Pradesh for pilgrims?',
 'location': 'Andhra_Pradesh',
 'doc_id': 'd4402d82c0',
 'content': 'Asia > South Asia > India > Southern India > Andhra Pradesh  \n![0_image_0.png](0_image_0.png)'}

In [13]:
from tqdm.auto import tqdm

vectors = []

for doc in tqdm(documents):
    question = doc['question']
    text = doc['content']
    # Combine question and content for better semantic representation
    vector = model.encode(question + ' ' + text)
    vectors.append(vector)
print(f"Generated {len(vectors)} vectors for {len(documents)} question-document pairs")

  0%|          | 0/735 [00:00<?, ?it/s]

Generated 735 vectors for 735 question-document pairs


In [14]:
import numpy as np

vectors = np.array(vectors)

In [15]:
from minsearch.vector import VectorSearch

vindex = VectorSearch(keyword_fields=['location'])
vindex.fit(vectors, documents)

<minsearch.vector.VectorSearch at 0x1d231902c50>

## Retrieval

In [16]:
def minsearch_vector_search(vector, location=None, num_results=5):
    """
    Run a similarity search against the notebook-level `vindex` vector index.

    Args:
        vector: Query vector for similarity search
        location: Optional value for the 'location' keyword field; when truthy,
            results are restricted to records with that location
        num_results: Maximum number of results to request (default 5, the
            value that used to be hard-coded)
    Returns:
        The search results returned by `vindex.search`
    """
    # An absent/empty location means "no filter": pass None rather than {}
    filter_dict = {'location': location} if location else None

    return vindex.search(
        vector,
        filter_dict=filter_dict,
        num_results=num_results,
    )
def question_text_vector(q):
    """
    Embed the question text and look up the closest indexed records.

    Args:
        q: Mapping with a 'question' entry and, optionally, a 'location' entry
    Returns:
        Whatever minsearch_vector_search returns for the encoded question
    """
    # 'location' may be missing; .get() then yields None and no filter is applied
    text, place = q['question'], q.get('location')

    return minsearch_vector_search(model.encode(text), place)

In [17]:
question_text_vector(dict(
    question='What are the must-see religious temples to visit in Andhra Pradesh?',
    #location='Karnataka'
))

[{'id': '5da6b2ce',
  'question': 'What are the must-visit temples in Andhra Pradesh for religious tourism?',
  'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': '1  \n![1_image_0.png](1_image_0.png)'},
 {'id': 'a88123db',
  'question': 'What are the must-see religious temples in Andhra Pradesh?',
  'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': 'Here are some of the most notable cities.'},
 {'id': 'db0beb52',
  'question': 'What are the must-see religious temples to visit in Andhra Pradesh?',
  'location': 'Andhra_Pradesh',
  'doc_id': 'd4402d82c0',
  'content': 'Northern Coast (Alluri Sitharama Raju, Anakapalli, East Godavari, Kakinada, Konaseema, Parvathipuram Manyam, Srikakulam, Visakhapatnam, Vizianagaram, Yanam) Central Coast (Eluru, Krishna, NTR, West Godavari) Southern Coast (Bapatla, Guntur, Nellore, Palnadu, Prakasam, Tirupati) Rayalaseema (Annamayya, Anantapur, Chittoor, Kadapa, Kurnool, Nandyal, Sri Sathya Sai)  \n![0_image_1.png](0_

## The RAG flow

In [18]:
def build_prompt(query, search_results):
    """
    Assemble the travel-assistant prompt for one question.

    Args:
        query: Question text inserted after "QUESTION:"
        search_results: Iterable of records, each providing 'location',
            'question' and 'content'; they are rendered in order as the CONTEXT
    Returns:
        The filled-in prompt with surrounding whitespace stripped
    """
    prompt_template = """
You're a travel assistant bot that helps users plan their itinerary and discover amazing places to visit. 
Answer the QUESTION based on the CONTEXT from the travel database.
Use only the facts from the CONTEXT when answering the QUESTION.

When answering, consider:
- Must-visit tourist attractions and landmarks
- Cultural experiences and local traditions  
- Historical significance of places
- Best times to visit and travel tips
- Local cuisine and specialties (if mentioned in context)
- Transportation and accessibility information (if available)

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One "location / question / content" entry per retrieved record,
    # each followed by a blank line
    context = "".join(
        f"location: {doc['location']}\nquestion: {doc['question']}\ncontent: {doc['content']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [19]:
from dotenv import load_dotenv
import os
import requests

load_dotenv()  # Loads variables from .env
API_KEY = os.getenv("PERPLEXITY_API_KEY")

In [20]:
 #LLM function to support multiple models
def llm(prompt, model='sonar', timeout=300):
    """
    Generate a response using one of the supported LLM backends.

    Args:
        prompt: Input prompt for the model
        model: Model to use ('sonar', 'sonar-pro', 'phi3')
        timeout: Seconds to wait for the HTTP response before `requests`
            gives up; prevents a stalled connection from blocking forever
    Returns:
        Generated response text
    Raises:
        ValueError: If `model` is not one of the supported names
        RuntimeError: If a Perplexity model is requested but API_KEY is empty
        Exception: If the service answers with a non-200 status, or Ollama
            cannot be reached (request timeouts propagate as raised by `requests`)
    """
    if model in ('sonar', 'sonar-pro'):
        # Fail fast with a readable message instead of sending "Bearer None"
        if not API_KEY:
            raise RuntimeError(
                "PERPLEXITY_API_KEY is not set; add it to the environment or the .env file"
            )

        # Perplexity chat-completions endpoint
        response = requests.post(
            'https://api.perplexity.ai/chat/completions',
            headers={
                'Authorization': f'Bearer {API_KEY}',
                'Content-Type': 'application/json'
            },
            json={
                'model': model,
                'messages': [{"role": "user", "content": prompt}]
            },
            timeout=timeout,
        )

        if response.status_code != 200:
            # The status code is part of the message; the retry decorator looks for "429" in it
            raise Exception(f"Perplexity API call failed with status {response.status_code}: {response.text}")
        return response.json()['choices'][0]['message']['content']

    if model == 'phi3':
        # Local phi3 served by Ollama
        try:
            response = requests.post(
                'http://localhost:11434/api/generate',
                json={
                    'model': 'phi3',
                    'prompt': prompt,
                    'stream': False
                },
                timeout=timeout,
            )
        except requests.exceptions.ConnectionError as err:
            # Keep the original cause attached for debugging
            raise Exception("Could not connect to Ollama. Make sure Ollama is running on localhost:11434 and phi3 model is installed.") from err

        if response.status_code != 200:
            raise Exception(f"Ollama API call failed with status {response.status_code}: {response.text}")
        return response.json()['response']

    raise ValueError(f"Unsupported model: {model}. Supported models: 'sonar', 'sonar-pro', 'phi3'")


In [21]:
# # calls perplexity API to generate response
# def llm(prompt, model='sonar'):
#     response = requests.post(
#         'https://api.perplexity.ai/chat/completions',
#         headers={
#             'Authorization': f'Bearer {API_KEY}',
#             'Content-Type': 'application/json'
#         },
#         json={
#             'model': model,
#             'messages': [{"role": "user", "content": prompt}]
#         }
#     )
    
#     if response.status_code == 200:
#         return response.json()['choices'][0]['message']['content']
#     else:
#         raise Exception(f"API call failed with status {response.status_code}: {response.text}")

In [22]:
# retrieve relevant documents and generate answer

def rag(query: dict, model='sonar') -> str:
    """
    Answer a question with retrieval-augmented generation.

    Args:
        query: Mapping with a 'question' entry (and optionally 'location')
        model: Name of the LLM backend passed through to `llm`
    Returns:
        The text generated by the LLM for the prompt built from the retrieved records
    """
    retrieved = question_text_vector(query)
    return llm(build_prompt(query['question'], retrieved), model=model)

In [23]:
ground_truth[10]

{'question': 'What are the must-see religious temples to visit in Andhra Pradesh?',
 'id': 'db0beb52'}

In [24]:
rag(ground_truth[10], model='sonar') #testing

Exception: Perplexity API call failed with status 401: <html>
<head><title>401 Authorization Required</title></head>
<body>
<center><h1>401 Authorization Required</h1></center>
<hr><center>openresty/1.27.4</center>
<script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'97080fb47aaedfa6',t:'MTc1NTQyMjA2OC4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();</script></body>
</html>


In [None]:
rag(ground_truth[10], model='sonar-pro') #testing

In [None]:
try:
    print("Testing phi3 model:")
    print(rag(ground_truth[10], model='phi3')) # testing
except Exception as e:
    print(f"Phi3 model test failed: {e}")

In [None]:
doc_idx['db0beb52']['content']

'Northern Coast (Alluri Sitharama Raju, Anakapalli, East Godavari, Kakinada, Konaseema, Parvathipuram Manyam, Srikakulam, Visakhapatnam, Vizianagaram, Yanam) Central Coast (Eluru, Krishna, NTR, West Godavari) Southern Coast (Bapatla, Guntur, Nellore, Palnadu, Prakasam, Tirupati) Rayalaseema (Annamayya, Anantapur, Chittoor, Kadapa, Kurnool, Nandyal, Sri Sathya Sai)  \n![0_image_1.png](0_image_1.png) interactive map'

In [None]:
def compare_models(query, models=('sonar', 'sonar-pro', 'phi3')):
    """
    Compare responses from different models for the same query.

    Args:
        query: Dictionary containing question and optional location
        models: Iterable of model names to compare (an immutable tuple is used
            as the default so the default can never be mutated between calls)
    Returns:
        Dictionary keyed by model name. Each value has 'answer' and 'status'
        ('success' or 'error'); failed entries also carry an 'error' message.
    """
    results = {}

    for model in models:
        try:
            print(f"Testing {model}...")
            answer = rag(query, model=model)
        except Exception as e:
            # A failing backend must not stop the comparison of the others
            results[model] = {
                'answer': None,
                'status': 'error',
                'error': str(e)
            }
            print(f"Error with {model}: {e}")
        else:
            results[model] = {
                'answer': answer,
                'status': 'success'
            }

    return results

In [None]:
# Example comparison
test_query = ground_truth[10]
comparison_results = compare_models(test_query)

# The loop variable is deliberately NOT called `model`: at notebook scope that
# name holds the SentenceTransformer instance, and rebinding it to a string
# would break every later model.encode(...) call.
for llm_name, result in comparison_results.items():
    print(f"\n{'='*50}")
    print(f"Model: {llm_name}")
    print(f"Status: {result['status']}")
    if result['status'] == 'success':
        # Only the first 200 characters are shown to keep the output short
        print(f"Answer: {result['answer'][:200]}...")
    else:
        print(f"Error: {result['error']}")

## Cosine similarity metric

In [None]:
# Example of computing cosine similarity between two answers
answer_orig = 'The must-see religious temples to visit in Andhra Pradesh include:\n\n- **Tirumala Venkateswara Temple** in Tirumala hills, dedicated to Lord Venkateswara. It is highly revered and attracts millions of devotees annually for its spiritual significance and rich traditions[1][3][4].\n\n- **Srisailam Mallikarjuna Temple**, a Jyotirlinga temple nestled in the Nallamala forests, dedicated to Lord Shiva and Goddess Parvati, known for its historical importance and Dravidian architecture[1][3][4].\n\n- **Sri Kalahasteeswara Temple**, known for Rahu-Ketu dosha remedies and famous for its striking Dravidian architecture, dedicated to Lord Shiva[1].\n\n- **Ahobilam Temple** complex located in the Nallamala hills, dedicated to Lord Narasimha in nine forms across nine shrines. It offers spiritual tranquility amidst natural beauty and is a prime pilgrimage site[1][3].\n\n- **Kanaka Durga Temple** in Vijayawada, dedicated to Goddess Durga. This temple is renowned for its Dravidian architectural style, religious legends, and location atop Indrakeeladri hills by the Krishna River. It is a major destination for religious tourism in Vijayawada[2][4].\n\n- Additional notable temples for religious tourism include **Sri Yagantiswamy (Uma Maheshwara) Temple** at Yaganti in Kurnool district, famous for its Shiva idol carved from a single stone and historical contributions from multiple South Indian dynasties[5].\n\nThese temples reflect Andhra Pradesh’s rich cultural heritage, blending architectural grandeur with profound religious significance, offering authentic spiritual experiences. The best time to visit is often during festival seasons specific to each temple, and many temples provide online booking for darshan and seva, enhancing accessibility for pilgrims[4].'
answer_llm = 'Northern Coast (Alluri Sitharama Raju, Anakapalli, East Godavari, Kakinada, Konaseema, Parvathipuram Manyam, Srikakulam, Visakhapatnam, Vizianagaram, Yanam) Central Coast (Eluru, Krishna, NTR, West Godavari) Southern Coast (Bapatla, Guntur, Nellore, Palnadu, Prakasam, Tirupati) Rayalaseema (Annamayya, Anantapur, Chittoor, Kadapa, Kurnool, Nandyal, Sri Sathya Sai)  \n![0_image_1.png](0_image_1.png) interactive map'

# Encode both answers to vectors
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# Compute cosine similarity using dot product (vectors are normalized)
similarity_score = v_llm.dot(v_orig)
print(f"Cosine similarity: {similarity_score}")

Cosine similarity: 0.4495590627193451


In [None]:
ground_truth[0]

{'question': 'What are the must-see religious sites in Andhra Pradesh for pilgrims?',
 'id': '4f80b327'}

In [None]:
len(ground_truth)

735

In [None]:
answers = {}

#### Parallel processing: the original sequential run showed an estimated completion time of more than 1 hour 30 minutes, so the requests are processed in parallel below.

In [None]:
# ADAPTIVE RATE LIMITING - Dynamically adjusts to API limits
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random
from functools import wraps
import threading

# Global variables to track rate limiting (shared by all worker threads)
rate_limit_counter = 0
rate_limit_lock = threading.Lock()
current_delay = 0.1  # Start with minimal delay
max_workers_current = 3  # Not read by the cells visible here; kept for compatibility

def adaptive_rate_limit_retry(max_retries=3):
    """
    Decorator factory: retry the wrapped call when it fails with a rate-limit error.

    An exception counts as a rate-limit error when its message contains "429"
    or "rate limit" (case-insensitive). Any other exception is re-raised
    immediately. The wait before each retry grows with the attempt number and
    with the total number of rate limits seen so far.

    Args:
        max_retries: Total number of attempts (must be >= 1)
    Returns:
        A decorator that wraps a function with the retry behaviour
    Raises:
        ValueError: If max_retries is smaller than 1 (the wrapped function
            would otherwise never be called and None would be returned silently)
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            global rate_limit_counter, current_delay

            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    message = str(e)
                    if "429" not in message and "rate limit" not in message.lower():
                        # Not a rate-limit problem: retrying would not help
                        raise

                    # Update and snapshot the shared counter under the lock so
                    # concurrent workers see a consistent value
                    with rate_limit_lock:
                        rate_limit_counter += 1
                        hits = rate_limit_counter

                    if attempt == max_retries - 1:
                        # Out of attempts: surface the original error and traceback
                        raise

                    # Adaptive delay based on rate limit frequency
                    base_delay = min(5.0, 1.0 + (hits * 0.1))
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 2)

                    print(f"Rate limit #{hits}, waiting {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
                    time.sleep(delay)

                    # Increase global delay on every 10th rate limit
                    if hits % 10 == 0:
                        with rate_limit_lock:
                            current_delay = min(2.0, current_delay + 0.2)
                            new_delay = current_delay
                        print(f"Increased base delay to {new_delay:.1f}s due to frequent rate limits")
        return wrapper
    return decorator

@adaptive_rate_limit_retry(max_retries=4)
def llm_with_adaptive_retry(prompt, model='sonar'):
    """Forward the prompt to `llm`; the decorator allows up to four attempts on rate-limit errors."""
    response_text = llm(prompt, model)
    return response_text

def rag_with_adaptive_retry(query: dict, model='sonar') -> str:
    """Same flow as the plain RAG pipeline, but the LLM call goes through the retrying wrapper."""
    retrieved = question_text_vector(query)
    return llm_with_adaptive_retry(
        build_prompt(query['question'], retrieved),
        model=model,
    )

def process_single_record_with_model(item, model='sonar'):
    """
    Generate an answer for one ground-truth record and pair it with its reference text.

    Args:
        item: Tuple `(i, rec)` where `i` is the record's position and `rec` is a
            mapping with at least 'id' and 'question'
        model: LLM backend name passed to the RAG pipeline
    Returns:
        Tuple `(i, result)`. `result` is a dict with the generated answer, the
        reference content from `doc_idx`, and metadata; it is None when the
        record has no entry in `doc_idx` or when processing raised an exception.
    """
    i, rec = item

    try:
        doc_id = rec['id']
        original_doc = doc_idx.get(doc_id)
        if original_doc is None:
            # Without a reference document the answer would be discarded anyway,
            # so skip the delay and the (billable) LLM call entirely
            return i, None

        # Adaptive delay based on current rate limiting (only for API models)
        if model in ('sonar', 'sonar-pro'):
            adaptive_delay = current_delay + random.uniform(0, current_delay)
            time.sleep(adaptive_delay)

        answer_llm = rag_with_adaptive_retry(rec, model=model)

        return i, {
            'answer_llm': answer_llm,
            'answer_orig': original_doc['content'],
            'document': doc_id,
            'question': rec['question'],
            'location': rec.get('location', ''),
            'model': model,
        }

    except Exception as e:
        # Best effort: report the failure and let the batch continue
        print(f"Failed to process record {i} with {model} after all retries: {e}")
        return i, None

# %%
def process_model_batch(ground_truth, model, output_file):
    """
    Process all ground truth data for a specific model and save the results.

    Records are handled in chunks of 50 by a small thread pool. Successful
    answers are merged with their ground-truth rows, written to `output_file`
    as CSV and returned.

    Args:
        ground_truth: List of ground-truth records (mappings with 'id' and 'question')
        model: LLM backend name ('sonar', 'sonar-pro' or 'phi3')
        output_file: Path of the CSV file to write
    Returns:
        pandas DataFrame with one row per successfully processed record
        (records that failed are omitted)
    """
    print(f"\n{'='*60}")
    print(f"Processing with model: {model}")
    print(f"{'='*60}")

    # Started before any work so the timing is defined even for empty input
    start_time = time.time()
    answers = {}

    # Get items to process
    items_to_process = list(enumerate(ground_truth))
    print(f"Total records to process: {len(items_to_process)}")

    # Adjust workers based on model type
    if model == 'phi3':
        MAX_WORKERS = 1  # Local model, single worker
    else:
        MAX_WORKERS = 2  # API models, reduced workers

    CHUNK_SIZE = 50  # Process 50 at a time

    for chunk_start in range(0, len(items_to_process), CHUNK_SIZE):
        chunk_end = min(chunk_start + CHUNK_SIZE, len(items_to_process))
        chunk_items = items_to_process[chunk_start:chunk_end]

        print(f"\nProcessing chunk {chunk_start//CHUNK_SIZE + 1}/{(len(items_to_process)-1)//CHUNK_SIZE + 1}")
        print(f"Records {chunk_start}-{chunk_end - 1} of {len(ground_truth)} ({model})")

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_item = {
                executor.submit(process_single_record_with_model, item, model): item
                for item in chunk_items
            }

            with tqdm(total=len(chunk_items), desc=f"{model} Chunk {chunk_start//CHUNK_SIZE + 1}") as pbar:
                for future in as_completed(future_to_item):
                    try:
                        i, result = future.result()
                        if result is not None:
                            answers[i] = result
                    except Exception as e:
                        print(f"Error in future: {e}")
                    finally:
                        pbar.update(1)

        # Break between chunks for API models
        if model in ['sonar', 'sonar-pro'] and chunk_end < len(items_to_process):
            cooldown_time = 3 + (rate_limit_counter * 0.05)
            print(f"Cooling down for {cooldown_time:.1f}s before next chunk...")
            time.sleep(cooldown_time)

        print(f"Completed {len(answers)} out of {len(ground_truth)} total records")

    # Convert to results list in original record order. Failed records are
    # left out: None placeholders cannot be mixed with dict rows in a DataFrame.
    results = []
    for i in sorted(answers):
        merged = answers[i].copy()
        merged.update(ground_truth[i])  # ground-truth fields (e.g. 'id') win on conflict
        results.append(merged)

    # Save results
    df_results = pd.DataFrame(results)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_results.to_csv(output_file, index=False)
    print(f"Saved results to: {output_file}")

    elapsed_time = time.time() - start_time
    print(f"Total processing time for {model}: {elapsed_time:.2f} seconds")

    return df_results

In [None]:
# Process each model
models_to_test = [
    ('sonar', '../data/results/results-sonar.csv'),
    ('sonar-pro', '../data/results/results-sonar-pro.csv'),
]

In [None]:
# Only add phi3 if Ollama is available
try:
    test_response = requests.get('http://localhost:11434/api/version', timeout=5)
except requests.exceptions.RequestException:
    # Connection refused, timeout, etc. - anything else should surface normally
    print("Ollama not available - phi3 model will be skipped")
else:
    if test_response.status_code == 200:
        phi3_entry = ('phi3', '../data/results/results-phi3.csv')
        # Guard against duplicates when this cell is re-run
        if phi3_entry not in models_to_test:
            models_to_test.append(phi3_entry)
        print("Ollama detected - phi3 model will be included in comparison")
    else:
        print("Ollama not responding - phi3 model will be skipped")

# Process each model
# The loop variable is deliberately NOT called `model`: at notebook scope that
# name holds the SentenceTransformer instance used by later encode() calls.
model_results = {}
for llm_name, output_file in models_to_test:
    try:
        df_result = process_model_batch(ground_truth, llm_name, output_file)
    except Exception as e:
        print(f"Failed to process {llm_name}: {e}")
        continue

    if df_result is None:
        # Never store a missing result; downstream cells expect DataFrames
        print(f"Failed to process {llm_name}: no results were returned")
        continue

    model_results[llm_name] = df_result
    print(f"Successfully processed {len(df_result)} records with {llm_name}")


#### Cosine Similarity Comparison

In [None]:
def compute_similarity(record):
    """
    Score how close the generated answer is to the original one.

    Args:
        record: Mapping with 'answer_orig' and 'answer_llm'
    Returns:
        0.0 when either text is missing (NaN/None) or empty; otherwise the dot
        product of the two embeddings produced by the notebook-level `model`
    """
    reference = record['answer_orig']
    generated = record['answer_llm']

    # Nothing to compare if either side is missing or empty
    if pd.isna(reference) or pd.isna(generated):
        return 0.0
    if not (reference and generated):
        return 0.0

    embedding_llm = model.encode(str(generated))
    embedding_orig = model.encode(str(reference))
    return embedding_llm.dot(embedding_orig)


# Compute similarities for all models and save one CSV per model.
# Saving happens inside the loop so every model gets its own file (not just
# the last one) and nothing runs when `model_results` is empty.
for model_name, df in model_results.items():
    print(f"Computing similarities for {model_name}...")

    results = df.to_dict(orient='records')
    similarities = [
        compute_similarity(record)
        for record in tqdm(results, desc=f"Computing similarities for {model_name}")
    ]

    # Stored on the frame held in model_results so later cells can plot it
    df['cosine'] = similarities

    # Save updated results with cosine similarities
    output_file = f'../data/results/results-{model_name}-cosine.csv'
    df.to_csv(output_file, index=False)

    print(f"{model_name} cosine similarity statistics:")
    print(df['cosine'].describe())
    print(f"Saved to: {output_file}\n")

#### Model Comparison Visualization

In [None]:
# Create comparison plots
if model_results:
    # One histogram panel per model; squeeze=False keeps `axes` 2-D even for one model
    fig, axes = plt.subplots(1, len(model_results), figsize=(15, 5), squeeze=False)

    for ax, (model_name, df) in zip(axes[0], model_results.items()):
        sns.histplot(df['cosine'], bins=30, alpha=0.7, label=model_name, ax=ax)
        ax.set_title(f'{model_name} Cosine Similarity Distribution')
        ax.set_xlabel('Cosine Similarity')
        ax.set_ylabel('Frequency')
        ax.legend()

    fig.tight_layout()
    fig.savefig('../data/results/model_comparison_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Create combined comparison plot
    # kdeplot replaces the deprecated distplot(hist=False): same density curve
    fig, ax = plt.subplots(figsize=(12, 6))
    for model_name, df in model_results.items():
        sns.kdeplot(df['cosine'], label=model_name, ax=ax)

    ax.set_title("Model Performance Comparison - Cosine Similarity")
    ax.set_xlabel("A->Q->A' Cosine Similarity")
    ax.set_ylabel("Density")
    ax.legend()
    fig.savefig('../data/results/model_comparison_combined.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    # plt.subplots(1, 0) would raise; nothing to draw without results
    print("No model results available - skipping comparison plots")

# Print comparison summary
print("\nModel Comparison Summary:")
print("="*60)
for model_name, df in model_results.items():
    stats = df['cosine'].describe()
    print(f"{model_name.upper()}:")
    print(f"  Mean: {stats['mean']:.4f}")
    print(f"  Std:  {stats['std']:.4f}")
    print(f"  Min:  {stats['min']:.4f}")
    print(f"  Max:  {stats['max']:.4f}")
    print(f"  Median: {stats['50%']:.4f}")
    print()

#### LLM-as-a-Judge Evaluation

In [None]:
# Judge prompt that compares a generated answer with the original answer.
# It is filled with str.format using the keys answer_orig, question and
# answer_llm; the doubled braces {{ }} render as literal braces in the
# JSON skeleton the judge is asked to return.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt that rates the generated answer against the question only
# (placeholders: question, answer_llm). It is not referenced by the cells
# visible in this part of the notebook.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
# Evaluate samples from each model
def _parse_judge_response(raw_response):
    """Turn one raw judge reply into a dict that always carries a "Relevance" key.

    Any reply that is not a string, is not valid JSON, or is not a JSON object with a
    "Relevance" field is reported and replaced by an ERROR record, so one bad reply
    never aborts the run or leaves holes in the results table.
    """
    parse_failure = {"Relevance": "ERROR", "Explanation": "JSON parsing failed"}

    if not isinstance(raw_response, str):
        print(f"JSON parsing error: expected a string, got {type(raw_response).__name__}")
        return parse_failure

    text = raw_response.strip()
    # Judges sometimes wrap the object in a Markdown fence or add prose despite the
    # instructions; keep only the outermost {...} span when there is one.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return parse_failure

    # Valid JSON that is not an object (list, bare string, ...) or lacks the verdict
    # cannot be tabulated downstream.
    if not isinstance(parsed, dict) or "Relevance" not in parsed:
        print("JSON parsing error: response is not an object with a 'Relevance' field")
        return parse_failure

    return parsed


def evaluate_model_sample(df, model_name, sample_size=150, prompt_template=None, judge_model='sonar'):
    """Evaluate a sample of results from a specific model with an LLM judge.

    Args:
        df: DataFrame of results; every row must provide the fields that the prompt
            template references (``answer_orig``, ``question`` and ``answer_llm`` for
            the default template).
        model_name: Label stored in the ``model`` column of the result and shown in
            progress messages.
        sample_size: Maximum number of rows to judge; capped at ``len(df)``.
        prompt_template: ``str.format`` template for the judge prompt. Defaults to the
            notebook-level ``prompt1_template``.
        judge_model: Model name forwarded to the notebook's ``llm`` helper.

    Returns:
        DataFrame with one row per judged sample, containing the parsed judge fields
        (at least ``Relevance``) plus a ``model`` column. Failed API calls and
        unparsable replies appear as rows with ``Relevance == "ERROR"``. For an empty
        sample the frame is empty but still has the ``Relevance``, ``Explanation`` and
        ``model`` columns.
    """
    if prompt_template is None:
        prompt_template = prompt1_template

    # Fixed random_state keeps the judged subset identical across re-runs
    df_sample = df.sample(n=min(sample_size, len(df)), random_state=1)
    samples = df_sample.to_dict(orient='records')

    print(f"Evaluating {len(samples)} samples from {model_name}...")

    json_evaluations = []
    for record in tqdm(samples, desc=f"Evaluating {model_name}"):
        # A missing template field is a setup error, so it is allowed to propagate
        prompt = prompt_template.format(**record)
        try:
            raw_evaluation = llm(prompt, model=judge_model)
        except Exception as e:
            # Best effort: a failed call is recorded as an ERROR row instead of stopping the run
            print(f"Evaluation error: {e}")
            json_evaluations.append({"Relevance": "ERROR", "Explanation": "Evaluation failed"})
            continue
        json_evaluations.append(_parse_judge_response(raw_evaluation))

    if json_evaluations:
        df_evaluations = pd.DataFrame(json_evaluations)
    else:
        # Keep the expected columns so callers can still index 'Relevance' on an empty sample
        df_evaluations = pd.DataFrame(columns=["Relevance", "Explanation"])
    df_evaluations['model'] = model_name

    return df_evaluations

In [None]:
# Run the LLM judge over every model's results, collecting one verdict table per model
all_evaluations = []

for model_name, df in model_results.items():
    try:
        eval_df = evaluate_model_sample(df, model_name)
        all_evaluations.extend([eval_df])

        # Show how the verdicts are distributed for this model
        print(f"\n{model_name} evaluation results:")
        print(eval_df['Relevance'].value_counts())

        # Persist the per-model verdicts next to the other result files
        eval_df.to_csv(
            f'../data/results/evaluations-{model_name}.csv',
            index=False,
        )
    except Exception as err:
        # A failure for one model is reported and the loop moves on to the next
        print(f"Failed to evaluate {model_name}: {err}")

In [None]:
# Combine all evaluations for comparison
if all_evaluations:
    df_all_evaluations = pd.concat(all_evaluations, ignore_index=True)
    df_all_evaluations.to_csv('../data/results/evaluations-all-models.csv', index=False)

    # Verdict counts: one row per model, one column per Relevance label
    print("\nEvaluation Summary by Model:")
    print("="*50)
    evaluation_summary = df_all_evaluations.groupby(['model', 'Relevance']).size().unstack(fill_value=0)
    print(evaluation_summary)

    # Within-model shares of each label. `evaluation_percentages` holds 0-1 fractions;
    # it is scaled to 0-100 for display so the numbers match the "Percentage" labels.
    evaluation_percentages = df_all_evaluations.groupby('model')['Relevance'].value_counts(normalize=True).unstack(fill_value=0)
    relevance_pct_table = evaluation_percentages * 100
    print("\nRelevance Percentages by Model:")
    print("="*50)
    print(relevance_pct_table.round(1))

    # Create visualization of evaluation results (explicit axes instead of pyplot state)
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    ax_counts, ax_pct, ax_overview, ax_relevant = axes.ravel()

    # Plot 1: Count of evaluations by model and relevance
    evaluation_summary.plot(kind='bar', ax=ax_counts)
    ax_counts.set_title('Evaluation Counts by Model and Relevance')
    ax_counts.set_ylabel('Count')
    ax_counts.tick_params(axis='x', labelrotation=45)
    ax_counts.legend(title='Relevance')

    # Plot 2: Percentage of evaluations by model and relevance
    relevance_pct_table.plot(kind='bar', ax=ax_pct)
    ax_pct.set_title('Evaluation Percentages by Model and Relevance')
    ax_pct.set_ylabel('Percentage')
    ax_pct.tick_params(axis='x', labelrotation=45)
    ax_pct.legend(title='Relevance')

    # Plot 3: Which rows of the combined table were judged RELEVANT, per model
    # (x = row index in df_all_evaluations, y = model name)
    for model_name in df_all_evaluations['model'].unique():
        model_data = df_all_evaluations[df_all_evaluations['model'] == model_name]
        relevant_mask = model_data['Relevance'] == 'RELEVANT'
        if relevant_mask.any():
            ax_overview.scatter(
                model_data.index[relevant_mask],
                [model_name] * int(relevant_mask.sum()),
                alpha=0.6, label=f'{model_name} (Relevant)', s=30,
            )
    ax_overview.set_title('Model Performance Overview')
    ax_overview.set_ylabel('Model')
    ax_overview.set_xlabel('Sample Index')
    # Only draw a legend when something was plotted; an empty legend just emits a warning
    if ax_overview.get_legend_handles_labels()[0]:
        ax_overview.legend()

    # Plot 4: Summary metrics
    # An explicit dtype avoids the empty-Series dtype deprecation warning on older pandas
    relevant_percentages = (
        evaluation_percentages['RELEVANT']
        if 'RELEVANT' in evaluation_percentages.columns
        else pd.Series(dtype=float)
    )
    if not relevant_percentages.empty:
        (relevant_percentages * 100).plot(kind='bar', ax=ax_relevant, color='green', alpha=0.7)
        ax_relevant.set_title('Percentage of RELEVANT Responses by Model')
        ax_relevant.set_ylabel('Percentage')
        ax_relevant.tick_params(axis='x', labelrotation=45)
    else:
        # No model produced a RELEVANT verdict: hide the unused panel instead of an empty frame
        ax_relevant.set_axis_off()

    fig.tight_layout()
    fig.savefig('../data/results/evaluation_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

#### Final Model Performance Summary

In [None]:
def create_performance_summary(results=None, results_dir='../data/results', weights=(0.4, 0.4, 0.2)):
    """Create a comprehensive performance summary of all models.

    For every model this combines descriptive statistics of its ``cosine`` column with
    the share of each LLM-judge verdict read from ``evaluations-<model>.csv`` in
    ``results_dir``. The table is saved as ``model_performance_summary.csv`` in the same
    directory and printed, followed by a ranking on a weighted combined score.

    Args:
        results: Mapping of model name -> DataFrame with a ``cosine`` column.
            Defaults to the notebook-level ``model_results``.
        results_dir: Directory holding the per-model evaluation CSVs; the summary CSV
            is written there too (the directory is created if missing).
        weights: ``(cosine_mean, relevant_share, consistency)`` weights of the combined
            score, where consistency is ``1 - cosine_std``.

    Returns:
        DataFrame with one row per model, sorted by ``combined_score`` descending.
        Models without a usable evaluations file get 0 for every evaluation column.
    """
    if results is None:
        results = model_results
    cosine_weight, relevance_weight, consistency_weight = weights

    # Fixed column list so an empty `results` still yields a frame with the expected columns
    summary_columns = [
        'model', 'total_responses',
        'cosine_mean', 'cosine_std', 'cosine_median', 'cosine_min', 'cosine_max',
        'relevant_pct', 'partly_relevant_pct', 'non_relevant_pct', 'error_pct',
        'total_evaluated',
    ]
    summary_data = []

    for model_name, df in results.items():
        # Basic stats
        cosine_stats = df['cosine'].describe()

        # Evaluation stats (if available); zeros are the fallback for missing/empty files
        eval_stats = {
            'relevant_pct': 0,
            'partly_relevant_pct': 0,
            'non_relevant_pct': 0,
            'error_pct': 0,
            'total_evaluated': 0
        }
        try:
            eval_df = pd.read_csv(f'{results_dir}/evaluations-{model_name}.csv')
        except (FileNotFoundError, pd.errors.EmptyDataError):
            eval_df = None

        # A header-only file has zero rows; skip it to avoid dividing by zero
        if eval_df is not None and len(eval_df) > 0:
            relevance_counts = eval_df['Relevance'].value_counts()
            total_evals = len(eval_df)
            eval_stats = {
                'relevant_pct': relevance_counts.get('RELEVANT', 0) / total_evals * 100,
                'partly_relevant_pct': relevance_counts.get('PARTLY_RELEVANT', 0) / total_evals * 100,
                'non_relevant_pct': relevance_counts.get('NON_RELEVANT', 0) / total_evals * 100,
                'error_pct': relevance_counts.get('ERROR', 0) / total_evals * 100,
                'total_evaluated': total_evals
            }

        summary_data.append({
            'model': model_name,
            'total_responses': len(df),
            'cosine_mean': cosine_stats['mean'],
            'cosine_std': cosine_stats['std'],
            'cosine_median': cosine_stats['50%'],
            'cosine_min': cosine_stats['min'],
            'cosine_max': cosine_stats['max'],
            **eval_stats
        })

    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    os.makedirs(results_dir, exist_ok=True)
    # Written before scoring, so the CSV holds the raw metrics without `combined_score`
    summary_df.to_csv(f'{results_dir}/model_performance_summary.csv', index=False)

    print("Model Performance Summary:")
    print("=" * 80)
    print(summary_df.round(4))

    # Create a ranking based on multiple criteria.
    # The std of a single response is NaN; treat it as 0 spread so the score stays comparable.
    cosine_spread = summary_df['cosine_std'].astype(float).fillna(0.0)
    summary_df['combined_score'] = (
        summary_df['cosine_mean'] * cosine_weight +  # weight on cosine similarity
        (summary_df['relevant_pct'] / 100) * relevance_weight +  # weight on relevance
        (1 - cosine_spread) * consistency_weight  # weight on consistency (lower std is better)
    )

    summary_df_ranked = summary_df.sort_values('combined_score', ascending=False)

    print("\nModel Ranking (Combined Score):")
    print("=" * 50)
    for i, (_, row) in enumerate(summary_df_ranked.iterrows(), 1):
        print(f"{i}. {row['model']}: {row['combined_score']:.4f}")
        print(f"   Cosine Mean: {row['cosine_mean']:.4f}, Relevant: {row['relevant_pct']:.1f}%")

    return summary_df_ranked


In [None]:
# Build, save and print the cross-model summary; the returned frame is sorted by combined score (best first)
performance_summary = create_performance_summary()