<a href="https://colab.research.google.com/github/EbubeTheGoat/R.A.G/blob/main/R_A_G.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells save the model and the
# pickled chunks under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face token from the Colab secret named HF_TOKEN and log in
# with it (also storing it as a git credential).
# NOTE(review): presumably needed to download the gated Llama checkpoint — confirm.
hf_token = userdata.get("HF_TOKEN")
login(hf_token,add_to_git_credential = True)

In [None]:
# Path of the source PDF; the extraction cell below opens it with pdfplumber.
document = "/content/200l.pdf"

In [None]:
!pip install pdfplumber
!pip install bitsandbytes accelerate
!pip install langchain-openai
!pip install langchain-chroma
!pip install langchain-huggingface
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

In [None]:


# Collect the student/pharmacist names from the second column of each page's table.
pdf_path = document
all_names = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # extract_table() yields a list of rows for the page, or None when
        # no table is detected.
        table = page.extract_table()
        if not table:
            continue
        for row in table:
            # The name lives in the second column, so shorter rows are skipped.
            if not row or len(row) < 2:
                continue
            # Strip first so padded header cells (" NAMES ") are still recognised
            # and whitespace-only cells are treated as empty.
            raw_name = (row[1] or "").strip()
            if not raw_name or raw_name == "NAMES" or raw_name.startswith("FACULTY"):
                continue
            clean_name = raw_name.replace(",", "").strip().title()
            # A cell made only of commas/whitespace cleans down to "" — drop it
            # rather than asking the LLM to invent a profile for an empty name.
            if clean_name:
                all_names.append(clean_name)

print(f"Extracted {len(all_names)} names")

In [None]:
# Load the Llama 3.2 3B Instruct checkpoint with 4-bit quantization. The next
# cell uses `model` and `tokenizer` to generate the synthetic profiles.



Model = "meta-llama/Llama-3.2-3B-Instruct"
# bitsandbytes settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(Model)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    Model,
    quantization_config=quant_config,
    device_map="auto",
    low_cpu_mem_usage=True
)
# Switch the model to evaluation (inference) mode.
model.eval()

In [None]:
def _parse_profile_batch(raw_response):
    """Turn the model's raw reply into a list of profile dicts.

    The model is asked for a bare JSON array but may wrap it in markdown fences
    or surrounding prose, so only the text between the first "[" and the last
    "]" is parsed when such a span exists.

    Args:
        raw_response: Decoded text generated by the model.

    Returns:
        A list containing only the dict elements of the parsed JSON. A single
        JSON object is wrapped in a one-element list.

    Raises:
        ValueError: If the text is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or parses to something other than a list/dict.
    """
    start = raw_response.find("[")
    end = raw_response.rfind("]")
    candidate = raw_response[start:end + 1] if 0 <= start < end else raw_response
    parsed = json.loads(candidate)
    if isinstance(parsed, dict):
        # extend() on a dict would add its *keys* to all_data; wrap it instead.
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")
    return [record for record in parsed if isinstance(record, dict)]


# Names are sent to the model in small batches to keep each prompt and reply
# within the context/token limits.
all_data = []
batch_size = 10

for i in range(0, len(all_names), batch_size):

    batch_names = all_names[i:i+batch_size]
    batch_number = i // batch_size + 1

    messages = [
        {"role": "system", "content": "You create synthetic data for pharmacy professionals. Return ONLY valid JSON with no additional text."},
        {"role": "user", "content": f"""Create synthetic data for these pharmacists. Return as JSON array with this exact structure:
[
  {{
    name: Full Name,
    favorite_course: choose between 'CPM282','PCL451','PCH511','PMB442',
    expertise: choose between 'Oncology' , 'Family Medicine','HIV/AIDS and Tuberculosis',
    age: number between 20-35,
    gender: 'Male' or 'Female',
    marital_status: choose between 'Single','Married',
    favorite_sport: choose between 'Football','Basketball','Volleyball','Olympics'
  }}
]

Names: {batch_names}

Return ONLY the JSON array, no other text."""}
    ]

    # return_dict=True also yields the attention mask, which generate() needs
    # to tell padding from content because pad_token == eos_token here.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        truncation=True,
        max_length=2048,
        add_generation_prompt=True
    ).to(model.device)  # follow wherever device_map="auto" placed the model
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=2048,  # Increased for more names
            do_sample=True,  # Enable sampling for variety
            temperature=0.7,  # Add some randomness
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    print(f"\nBatch {batch_number} response:\n{response}\n")

    # Try to parse JSON; a bad batch is reported and skipped, not fatal.
    try:
        batch_data = _parse_profile_batch(response)
    except ValueError:
        print(f"Failed to parse JSON for batch {batch_number}")
    else:
        all_data.extend(batch_data)

print(f"\nTotal records created: {len(all_data)}")
print(json.dumps(all_data, indent=2))

In [None]:
all_data[0]

In [None]:
!pip install langchain-text-splitters
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
import os
from google.colab import userdata
# Chat model name (passed to ChatOpenAI below) and the directory Chroma persists to.
MODEL = "gpt-4.1-nano"
db_name = "vector_db"
# Copy the key from the Colab secret into the process environment, then read it
# back to confirm it is set.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key:
    # Only the first 8 characters are echoed, never the full key.
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

In [None]:
print(all_data[0].keys())

In [None]:
from langchain_core.documents import Document
# One Document per profile: the page content is the profile flattened to
# "key:value, key:value, ..." and the metadata keeps the person's name.
docs = [
  Document(
    page_content=", ".join(f"{field}:{entry}" for field, entry in profile.items()),
    metadata={"name": profile.get("name")},
  )
  for profile in all_data
]



In [None]:
from langchain_text_splitters import MarkdownTextSplitter

In [None]:
# Split the profile documents into overlapping chunks (chunk_size=120,
# chunk_overlap=60) and show how many chunks resulted plus the first one.
# NOTE(review): each profile string is presumably longer than 120, so later
# chunks of a profile may not contain the name in their text — confirm this is intended.
text_splitter = MarkdownTextSplitter(chunk_size = 120,chunk_overlap= 60)
chunk = text_splitter.split_documents(docs)
print(f"Divided into {len(chunk)} chunks")
print(f"first chunk: \n\n {chunk[0]}")

In [None]:
# Persist the loaded model and tokenizer to Google Drive so a later session can
# reload them from model_path.
model_path = "/content/drive/MyDrive/pharmacist_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f'Model saved to {model_path}')

In [None]:
import pickle
# Pickle the list of chunked Documents to Google Drive.
chunks_path = "/content/drive/MyDrive/chunks.pkl"
with open(chunks_path, 'wb') as f:
    pickle.dump(chunk, f)
print("Chunks saved successfully")

In [None]:
# Reload the tokenizer and model from the Drive copy, rebinding the `tokenizer`
# and `model` names.
# NOTE(review): no quantization_config is passed here; presumably the saved
# config carries the 4-bit settings — confirm.
model_path = "/content/drive/MyDrive/pharmacist_model"
tokenizer=AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path,device_map="auto")

In [None]:
import pickle
# Restore the chunk list pickled earlier.
# SECURITY: pickle.load can execute arbitrary code; only load files you wrote yourself.
chunks_path = "/content/drive/MyDrive/chunks.pkl"
with open(chunks_path, 'rb') as f:
    chunk = pickle.load(f)
print(f"loaded {len(chunk)} chunks")

In [None]:
!pip install langchain-huggingface
!pip install langchain-chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [None]:
!pip install langchain-openai
from langchain_openai import OpenAIEmbeddings

import os


In [None]:

# Alternative embedding backend, currently disabled:
#embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings =OpenAIEmbeddings(model = "text-embedding-3-large")

# If a persisted store already exists under db_name, drop its collection first
# so the store is rebuilt from the current chunks.
if os.path.exists(db_name):
  Chroma(persist_directory=db_name,embedding_function=embeddings).delete_collection()
vectorstore = Chroma.from_documents(chunk,embedding=embeddings,persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents ")

In [None]:
# Report how many vectors are stored and their dimensionality.
# `_collection` is a private attribute of the Chroma wrapper.
collection = vectorstore._collection
count = collection.count()

# Fetch a single stored embedding and use its length as the dimension count.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Retriever over the vector store and the chat model (temperature 0) that the
# RAG functions below use via the global names `retriever` and `llm`.
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(temperature=0, model_name=MODEL)

In [None]:
retriever.invoke("Who is George Ebubechukwu?")

In [None]:
llm.invoke("Who is George Ebubechukwu?")

In [None]:
import json
import os
from google.colab import userdata
from tqdm import tqdm
from openai import OpenAI # For a progress bar

# 1. Setup API Key and Model (Replace with your actual key name)
# Ensure you have 'OPENAI_API_KEY' in your Colab Secrets (🔑 icon)
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
MODEL = "gpt-4.1-nano"
client = OpenAI()

def generate_eval_record(person):
    """Ask the chat model for one question/answer evaluation record about a profile.

    Args:
        person: Profile dict; must contain the keys 'name', 'expertise', 'age',
            'marital_status', 'favorite_course' and 'favorite_sport'.

    Returns:
        The parsed JSON object returned by the model (expected keys: question,
        keywords, reference_answer, category).

    Raises:
        KeyError: If the profile lacks one of the required keys.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    prompt = f"""
    You are creating a diverse evaluation dataset for a pharmacist profile system.
    Using the profile below, generate ONE unique question/answer pair.

    Profile:
    - Name: {person['name']}
    - Expertise: {person['expertise']}
    - Age: {person['age']}
    - Marital Status: {person['marital_status']}
    - Favorite Course: {person['favorite_course']}
    - Favorite Sport: {person['favorite_sport']}

    Randomly pick ONE of these categories for the question:
    1. PROFESSIONAL: Ask about their career path or favorite academic course.
    2. PERSONAL: Ask about their hobbies, sport, or family status.
    3. HYPOTHETICAL: Ask how they might use their expertise in a specific pharmacy scenario.
    4. COMPARATIVE: Ask to compare their age or expertise to a typical professional role.

    You MUST randomly choose from all the categories above.Do not skew it to any two categories.
    All categories must be present

    Output format MUST be a single line JSON with no markdown formatting:
    {{"question": "...", "keywords": ["word1", "word2"], "reference_answer": "...", "category": "..."}}
    """

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is constrained to a single valid JSON object, so
        # markdown fences or surrounding prose can no longer break json.loads.
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)

# 2. Process all data and save to JSONL
output_filename = "diverse_tests.jsonl"
# Fields every record needs so it can later be loaded as a TestQuestion.
required_fields = {"question", "keywords", "reference_answer", "category"}
records_written = 0

with open(output_filename, "w", encoding="utf-8") as f:
    for person in tqdm(all_data, desc="Generating Dataset"):
        try:
            # Generate the record
            eval_record = generate_eval_record(person)

            # Reject incomplete records here; one bad line would otherwise
            # make loading the whole file fail later.
            if not isinstance(eval_record, dict) or not required_fields <= eval_record.keys():
                raise ValueError(f"record is missing required fields: {eval_record!r}")

            # Write exactly one JSON object per line
            f.write(json.dumps(eval_record) + "\n")
            records_written += 1
        except Exception as e:
            # The profile itself may be malformed (no 'name', or not a dict), so
            # the error report must not index into it directly.
            person_name = person.get("name", "<unknown>") if isinstance(person, dict) else repr(person)
            print(f"Skipping record for {person_name} due to error: {e}")

print(f"\nSuccessfully created {output_filename} ({records_written} of {len(all_data)} records written)")

In [None]:
# Preview: print the first five records of the generated dataset file.
with open("diverse_tests.jsonl", "r") as handle:
    # zip stops as soon as range(5) is exhausted, so at most five lines are shown.
    for _, raw_line in zip(range(5), handle):
        print(raw_line.strip())

In [None]:
import json
from pathlib import Path
from pydantic import BaseModel, Field

TEST_FILE = str(Path.cwd()/ "diverse_tests.jsonl")


class TestQuestion(BaseModel):
    """A test question with expected keywords and reference answer."""

    # One instance corresponds to one JSON object (one line) of the test file;
    # load_tests() builds instances with TestQuestion(**data).

    # The query sent to the RAG pipeline.
    question: str = Field(description="The question to ask the RAG system")
    # Substrings that calculate_mrr / calculate_ndcg search for in retrieved chunks.
    keywords: list[str] = Field(description="Keywords that must appear in retrieved context")
    # Answer the LLM judge compares the generated answer against.
    reference_answer: str = Field(description="The reference answer for this question")
    # NOTE(review): the examples in this description differ from the categories the
    # generation prompt asks for (PROFESSIONAL, PERSONAL, HYPOTHETICAL, COMPARATIVE).
    category: str = Field(description="Question category (e.g., direct_fact, spanning, temporal)")


def load_tests(path: str | None = None) -> list[TestQuestion]:
    """Load test questions from a JSONL file.

    Args:
        path: JSONL file to read, one JSON object per line. Defaults to
            TEST_FILE (looked up at call time) when omitted.

    Returns:
        One TestQuestion per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        pydantic.ValidationError: If a record lacks a required field.
    """
    source = TEST_FILE if path is None else path
    tests = []
    with open(source, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not records.
            if not line:
                continue
            tests.append(TestQuestion(**json.loads(line)))
    return tests


In [None]:
tests = load_tests()

In [None]:
len(tests )

In [None]:
example = tests[0]
print(example.question)
print(example.category)
print(example.reference_answer)
print(example.keywords)


In [None]:

from collections import Counter
count = Counter([t.category for t in tests])
count

In [None]:
from langchain_core.documents import Document
Retriever = 5
def fetch_context(question: str) -> list[Document]:
    """
    Look up the chunks most relevant to `question`.

    The global `retriever` is invoked with k set to the module-level
    `Retriever` constant, and whatever it returns is handed back unchanged.
    """
    top_k = Retriever
    matches = retriever.invoke(question, k=top_k)
    return matches

In [None]:

# System prompt template for the RAG chat; answer_question() fills the
# {context} placeholder with the retrieved chunk texts via str.format.
SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant who knows  about pharmacists,hobbies and expertise a lot.
You are chatting with a user about the pharmacist's life.
If relevant, use the given context to answer any question.
IF YOU DON'T KNOW THE ANSWER SAY SO.
Context:
{context}
"""

In [None]:
from pathlib import Path
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
from langchain_core.documents import Document

def combined_question(question: str, history: list[dict] = []) -> str:
    """
    Combine all the user's messages into a single string.
    """
    prior = "\n".join(m["content"] for m in history if m["role"] == "user")
    return prior + "\n" + question


def answer_question(question: str, history: list[dict] = []) -> tuple[str, list[Document]]:
    """
    Run one RAG turn: retrieve context for the conversation, then ask the LLM.

    The retrieval query is built by combined_question() from the history and the
    new question. The chat sent to `llm` is the system prompt (with the retrieved
    text filled in), the prior history, and finally the new question.

    Returns the model's reply text together with the retrieved documents.
    """
    retrieved = fetch_context(combined_question(question, history))
    prompt_text = SYSTEM_PROMPT.format(
        context="\n\n".join(item.page_content for item in retrieved)
    )
    conversation = [
        SystemMessage(content=prompt_text),
        *convert_to_messages(history),
        HumanMessage(content=question),
    ]
    reply = llm.invoke(conversation)
    return reply.content, retrieved

In [None]:
!pip install litellm
import sys
import math
from pydantic import BaseModel, Field
from litellm import completion
from dotenv import load_dotenv



MODEL = "gpt-4.1-nano"
db_name = "vector_db"


class RetrievalEval(BaseModel):
    """Evaluation metrics for retrieval performance."""

    # Instances are built by evaluate_retrieval() for a single test question.

    # Reciprocal rank of the first matching chunk, averaged over the test's keywords.
    mrr: float = Field(description="Mean Reciprocal Rank - average across all keywords")
    # Per-keyword nDCG (1 if the chunk contains the keyword, else 0), averaged over keywords.
    ndcg: float = Field(description="Normalized Discounted Cumulative Gain (binary relevance)")
    # Count of keywords that appeared in at least one retrieved chunk.
    keywords_found: int = Field(description="Number of keywords found in top-k results")
    total_keywords: int = Field(description="Total number of keywords to find")
    # keywords_found / total_keywords * 100, i.e. on a 0-100 scale.
    keyword_coverage: float = Field(description="Percentage of keywords found")


class AnswerEval(BaseModel):
    """LLM-as-a-judge evaluation of answer quality."""

    # This model is passed as response_format to the judge call in
    # evaluate_answer(), and the judge's reply is validated against it.
    # NOTE(review): the docstring and Field descriptions are presumably sent to
    # the judge as part of the schema, so treat them as prompt text — confirm.

    # Free-text explanation from the judge.
    feedback: str = Field(
        description="Concise feedback on the answer quality, comparing it to the reference answer and evaluating based on the retrieved context"
    )
    # The three scores below are on a 1-5 scale.
    accuracy: float = Field(
        description="How factually correct is the answer compared to the reference answer? 1 (wrong. any wrong answer must score 1) to 5 (ideal - perfectly accurate). An acceptable answer would score 3."
    )
    completeness: float = Field(
        description="How complete is the answer in addressing all aspects of the question? 1 (very poor - missing key information) to 5 (ideal - all the information from the reference answer is provided completely). Only answer 5 if ALL information from the reference answer is included."
    )
    relevance: float = Field(
        description="How relevant is the answer to the specific question asked? 1 (very poor - off-topic) to 5 (ideal - directly addresses question and gives no additional information). Only answer 5 if the answer is completely relevant to the question and gives no additional information."
    )

In [None]:
def calculate_mrr(keyword: str, retrieved_docs: list) -> float:
    """Reciprocal rank of the first document whose text contains `keyword`.

    Matching is a case-insensitive substring test on each document's
    `page_content`; ranks are 1-based. Returns 0.0 when no document matches.
    """
    needle = keyword.lower()
    first_hit = next(
        (
            position
            for position, doc in enumerate(retrieved_docs, start=1)
            if needle in doc.page_content.lower()
        ),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit


def calculate_dcg(relevances: list[int], k: int) -> float:
    """Discounted Cumulative Gain over the first `k` relevance values.

    The gain at 1-based rank r is divided by log2(r + 1); at most
    min(k, len(relevances)) positions contribute.
    """
    cutoff = min(k, len(relevances))
    total = 0.0
    # zip ends when the rank range is exhausted, so only `cutoff` gains are used.
    for rank, gain in zip(range(1, cutoff + 1), relevances):
        total += gain / math.log2(rank + 1)
    return total


In [None]:
def calculate_ndcg(keyword: str, retrieved_docs: list, k: int = 10) -> float:
    """nDCG for one keyword over the first `k` retrieved documents.

    A document scores 1 when its `page_content` contains the keyword
    (case-insensitive) and 0 otherwise. The ideal ordering is the same scores
    sorted descending. Returns 0.0 when no document matches.
    """
    needle = keyword.lower()
    hits = [int(needle in doc.page_content.lower()) for doc in retrieved_docs[:k]]

    actual = calculate_dcg(hits, k)
    # Best achievable DCG: all matching documents ranked first.
    best = calculate_dcg(sorted(hits, reverse=True), k)

    if best > 0:
        return actual / best
    return 0.0


In [None]:
def evaluate_retrieval(test: TestQuestion, k: int = 10) -> RetrievalEval:
    """
    Score the retriever on a single test question.

    Args:
        test: TestQuestion supplying the question and its expected keywords
        k: Cutoff handed to calculate_ndcg (default 10); the number of documents
           actually retrieved is decided by fetch_context, not by this value

    Returns:
        RetrievalEval holding the keyword-averaged MRR and nDCG plus keyword coverage
    """
    documents = fetch_context(test.question)
    keywords = test.keywords
    total_keywords = len(keywords)

    def _mean(values):
        # Average of a list, or 0.0 for an empty list.
        return sum(values) / len(values) if values else 0.0

    # One score per keyword for each metric.
    reciprocal_ranks = [calculate_mrr(word, documents) for word in keywords]
    ndcg_values = [calculate_ndcg(word, documents, k) for word in keywords]

    # A keyword counts as found when it appeared in any retrieved document.
    keywords_found = len([score for score in reciprocal_ranks if score > 0])
    if total_keywords > 0:
        coverage = keywords_found / total_keywords * 100
    else:
        coverage = 0.0

    return RetrievalEval(
        mrr=_mean(reciprocal_ranks),
        ndcg=_mean(ndcg_values),
        keywords_found=keywords_found,
        total_keywords=total_keywords,
        keyword_coverage=coverage,
    )


In [None]:
def evaluate_answer(test: TestQuestion) -> tuple[AnswerEval, str, list]:
    """
    Evaluate answer quality using LLM-as-a-judge.

    The call is synchronous: the RAG answer is generated first, then a judge
    model scores it against the reference answer.

    Args:
        test: TestQuestion object containing question and reference answer

    Returns:
        Tuple of (AnswerEval object, generated_answer string, retrieved_docs list)
    """
    # Get the RAG response (no chat history is passed)
    generated_answer, retrieved_docs = answer_question(test.question)

    # LLM judge prompt
    judge_messages = [
        {
            "role": "system",
            "content": "You are an expert evaluator assessing the quality of answers. Evaluate the generated answer by comparing it to the reference answer. Only give 5/5 scores for perfect answers.",
        },
        {
            "role": "user",
            "content": f"""Question:
{test.question}

Generated Answer:
{generated_answer}

Reference Answer:
{test.reference_answer}

Please evaluate the generated answer on three dimensions:
1. Accuracy: How factually correct is it compared to the reference answer? Only give 5/5 scores for perfect answers.
2. Completeness: How thoroughly does it address all aspects of the question, covering all the information from the reference answer?
3. Relevance: How well does it directly answer the specific question asked, giving no additional information?

Provide detailed feedback and scores from 1 (very poor) to 5 (ideal) for each dimension. If the answer is wrong, then the accuracy score must be 1.""",
        },
    ]

    # Call the LLM judge, requesting output that conforms to the AnswerEval model
    judge_response = completion(model=MODEL, messages=judge_messages, response_format=AnswerEval)

    # Parse and validate the judge's JSON reply
    answer_eval = AnswerEval.model_validate_json(judge_response.choices[0].message.content)

    return answer_eval, generated_answer, retrieved_docs


In [None]:
def evaluate_all_retrieval():
    """Generator: for each loaded test yield (test, RetrievalEval, progress).

    `progress` is the fraction of tests processed so far, ending at 1.0.
    """
    loaded = load_tests()
    how_many = len(loaded)
    for done, case in enumerate(loaded, start=1):
        outcome = evaluate_retrieval(case)
        yield case, outcome, done / how_many


def evaluate_all_answers():
    """Generator: for each loaded test yield (test, AnswerEval, progress).

    Tests are judged one at a time, in file order; only the AnswerEval part of
    evaluate_answer()'s result is passed on. `progress` is the fraction of
    tests processed so far, ending at 1.0.
    """
    loaded = load_tests()
    how_many = len(loaded)
    for done, case in enumerate(loaded, start=1):
        verdict, *_ = evaluate_answer(case)
        yield case, verdict, done / how_many



In [None]:
def run_cli_evaluation(test_number: int):
    """Run and print the retrieval and answer evaluation for one test.

    Args:
        test_number: Zero-based index into the list returned by load_tests().

    Raises:
        SystemExit: With status 1 when test_number is out of range.
    """
    # Load tests from the default test file. load_tests() takes no positional
    # path here, and "tests.jsonl" is not a file this notebook produces.
    tests = load_tests()

    if test_number < 0 or test_number >= len(tests):
        print(f"Error: test_row_number must be between 0 and {len(tests) - 1}")
        sys.exit(1)

    # Get the test
    test = tests[test_number]

    # Print test info
    print(f"\n{'=' * 80}")
    print(f"Test #{test_number}")
    print(f"{'=' * 80}")
    print(f"Question: {test.question}")
    print(f"Keywords: {test.keywords}")
    print(f"Category: {test.category}")
    print(f"Reference Answer: {test.reference_answer}")

    # Retrieval Evaluation
    print(f"\n{'=' * 80}")
    print("Retrieval Evaluation")
    print(f"{'=' * 80}")

    retrieval_result = evaluate_retrieval(test)

    print(f"MRR: {retrieval_result.mrr:.4f}")
    print(f"nDCG: {retrieval_result.ndcg:.4f}")
    print(f"Keywords Found: {retrieval_result.keywords_found}/{retrieval_result.total_keywords}")
    print(f"Keyword Coverage: {retrieval_result.keyword_coverage:.1f}%")

    # Answer Evaluation
    print(f"\n{'=' * 80}")
    print("Answer Evaluation")
    print(f"{'=' * 80}")

    # The retrieved documents are returned too but are not printed here.
    answer_result, generated_answer, _retrieved_docs = evaluate_answer(test)

    print(f"\nGenerated Answer:\n{generated_answer}")
    print(f"\nFeedback:\n{answer_result.feedback}")
    print("\nScores:")
    print(f"  Accuracy: {answer_result.accuracy:.2f}/5")
    print(f"  Completeness: {answer_result.completeness:.2f}/5")
    print(f"  Relevance: {answer_result.relevance:.2f}/5")
    print(f"\n{'=' * 80}\n")


In [None]:
evaluate_retrieval(example)

In [None]:
# Judge the example question end to end.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of the session.
eval,answer,chunks = evaluate_answer(example)

In [None]:
# NOTE(review): identical to the previous cell; running it repeats the RAG and
# judge calls for the same example.
eval,answer,chunks = evaluate_answer(example)

In [None]:
import gradio as gr
import pandas as pd
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv(override=True)

# Color coding thresholds - Retrieval
MRR_GREEN = 0.9
MRR_AMBER = 0.75
NDCG_GREEN = 0.9
NDCG_AMBER = 0.75
COVERAGE_GREEN = 90.0
COVERAGE_AMBER = 75.0

# Color coding thresholds - Answer (1-5 scale)
ANSWER_GREEN = 4.5
ANSWER_AMBER = 4.0


def get_color(value: float, metric_type: str) -> str:
    """Map a metric value to "green", "orange" or "red" for display.

    Each known metric type has a green and an amber threshold: values at or
    above green are "green", at or above amber are "orange", anything else is
    "red". Unknown metric types yield "black".
    """
    # (metric names, green threshold, amber threshold); read at call time so
    # changes to the module-level constants are picked up.
    bands = (
        (("mrr",), MRR_GREEN, MRR_AMBER),
        (("ndcg",), NDCG_GREEN, NDCG_AMBER),
        (("coverage",), COVERAGE_GREEN, COVERAGE_AMBER),
        (("accuracy", "completeness", "relevance"), ANSWER_GREEN, ANSWER_AMBER),
    )
    for names, green_at, amber_at in bands:
        if metric_type in names:
            if value >= green_at:
                return "green"
            if value >= amber_at:
                return "orange"
            return "red"
    return "black"


def format_metric_html(
    label: str,
    value: float,
    metric_type: str,
    is_percentage: bool = False,
    score_format: bool = False,
) -> str:
    """Render one metric as a colour-coded HTML card.

    The colour comes from get_color(value, metric_type). The value is shown as
    a percentage with one decimal when `is_percentage` is set, otherwise as
    "x.xx/5" when `score_format` is set, otherwise with four decimals.
    """
    color = get_color(value, metric_type)
    value_str = (
        f"{value:.1f}%"
        if is_percentage
        else f"{value:.2f}/5"
        if score_format
        else f"{value:.4f}"
    )
    return f"""
    <div style="margin: 10px 0; padding: 15px; background-color: #f5f5f5; border-radius: 8px; border-left: 5px solid {color};">
        <div style="font-size: 14px; color: #666; margin-bottom: 5px;">{label}</div>
        <div style="font-size: 28px; font-weight: bold; color: {color};">{value_str}</div>
    </div>
    """


def run_retrieval_evaluation(progress=gr.Progress()):
    """Run the retrieval evaluation over all tests and summarise it.

    Args:
        progress: Gradio progress tracker, updated after every test.

    Returns:
        Tuple of (summary HTML, DataFrame with columns "Category" and
        "Average MRR"). When there are no tests, a short notice and an empty
        DataFrame with those columns are returned.
    """
    total_mrr = 0.0
    total_ndcg = 0.0
    total_coverage = 0.0
    category_mrr = defaultdict(list)
    count = 0

    for test, result, prog_value in evaluate_all_retrieval():
        count += 1
        total_mrr += result.mrr
        total_ndcg += result.ndcg
        total_coverage += result.keyword_coverage

        category_mrr[test.category].append(result.mrr)

        # Update progress bar only
        progress(prog_value, desc=f"Evaluating test {count}...")

    # Nothing to average: report that instead of dividing by zero.
    if count == 0:
        empty_html = "<div style='padding: 20px; text-align: center; color: #999;'>No tests found to evaluate</div>"
        return empty_html, pd.DataFrame(columns=["Category", "Average MRR"])

    # Calculate final averages
    avg_mrr = total_mrr / count
    avg_ndcg = total_ndcg / count
    avg_coverage = total_coverage / count

    # Create final summary metrics HTML
    final_html = f"""
    <div style="padding: 0;">
        {format_metric_html("Mean Reciprocal Rank (MRR)", avg_mrr, "mrr")}
        {format_metric_html("Normalized DCG (nDCG)", avg_ndcg, "ndcg")}
        {format_metric_html("Keyword Coverage", avg_coverage, "coverage", is_percentage=True)}
        <div style="margin-top: 20px; padding: 10px; background-color: #d4edda; border-radius: 5px; text-align: center; border: 1px solid #c3e6cb;">
            <span style="font-size: 14px; color: #155724; font-weight: bold;">✓ Evaluation Complete: {count} tests</span>
        </div>
    </div>
    """

    # Create final bar chart data: one row per category with its mean MRR
    category_data = []
    for category, mrr_scores in category_mrr.items():
        avg_cat_mrr = sum(mrr_scores) / len(mrr_scores)
        category_data.append({"Category": category, "Average MRR": avg_cat_mrr})

    df = pd.DataFrame(category_data)

    return final_html, df


def run_answer_evaluation(progress=gr.Progress()):
    """Run the LLM-judged answer evaluation over all tests and summarise it.

    Args:
        progress: Gradio progress tracker, updated after every test.

    Returns:
        Tuple of (summary HTML, DataFrame with columns "Category" and
        "Average Accuracy"). When there are no tests, a short notice and an
        empty DataFrame with those columns are returned.
    """
    total_accuracy = 0.0
    total_completeness = 0.0
    total_relevance = 0.0
    category_accuracy = defaultdict(list)
    count = 0

    for test, result, prog_value in evaluate_all_answers():
        count += 1
        total_accuracy += result.accuracy
        total_completeness += result.completeness
        total_relevance += result.relevance

        category_accuracy[test.category].append(result.accuracy)

        # Update progress bar only
        progress(prog_value, desc=f"Evaluating test {count}...")

    # Nothing to average: report that instead of dividing by zero.
    if count == 0:
        empty_html = "<div style='padding: 20px; text-align: center; color: #999;'>No tests found to evaluate</div>"
        return empty_html, pd.DataFrame(columns=["Category", "Average Accuracy"])

    # Calculate final averages
    avg_accuracy = total_accuracy / count
    avg_completeness = total_completeness / count
    avg_relevance = total_relevance / count

    # Create final summary metrics HTML
    final_html = f"""
    <div style="padding: 0;">
        {format_metric_html("Accuracy", avg_accuracy, "accuracy", score_format=True)}
        {format_metric_html("Completeness", avg_completeness, "completeness", score_format=True)}
        {format_metric_html("Relevance", avg_relevance, "relevance", score_format=True)}
        <div style="margin-top: 20px; padding: 10px; background-color: #d4edda; border-radius: 5px; text-align: center; border: 1px solid #c3e6cb;">
            <span style="font-size: 14px; color: #155724; font-weight: bold;">✓ Evaluation Complete: {count} tests</span>
        </div>
    </div>
    """

    # Create final bar chart data: one row per category with its mean accuracy
    category_data = []
    for category, accuracy_scores in category_accuracy.items():
        avg_cat_accuracy = sum(accuracy_scores) / len(accuracy_scores)
        category_data.append({"Category": category, "Average Accuracy": avg_cat_accuracy})

    df = pd.DataFrame(category_data)

    return final_html, df


def main():
    """Build and launch the Gradio evaluation dashboard.

    The page has two sections, retrieval and answer evaluation. Each has a
    button that runs the corresponding evaluation and fills in a metrics panel
    and a per-category bar chart.
    """
    theme = gr.themes.Soft(font=["Inter", "system-ui", "sans-serif"])

    with gr.Blocks(title="RAG Evaluation Dashboard", theme=theme) as app:
        gr.Markdown("# 📊 RAG Evaluation Dashboard")
        gr.Markdown("Evaluate retrieval and answer quality for the pharmacist profile RAG system")

        # RETRIEVAL SECTION
        gr.Markdown("## 🔍 Retrieval Evaluation")

        retrieval_button = gr.Button("Run Evaluation", variant="primary", size="lg")

        with gr.Row():
            with gr.Column(scale=1):
                retrieval_metrics = gr.HTML(
                    "<div style='padding: 20px; text-align: center; color: #999;'>Click 'Run Evaluation' to start</div>"
                )

            with gr.Column(scale=1):
                retrieval_chart = gr.BarPlot(
                    x="Category",
                    y="Average MRR",
                    title="Average MRR by Category",
                    y_lim=[0, 1],
                    height=400,
                )

        # ANSWERING SECTION
        gr.Markdown("## 💬 Answer Evaluation")

        answer_button = gr.Button("Run Evaluation", variant="primary", size="lg")

        with gr.Row():
            with gr.Column(scale=1):
                answer_metrics = gr.HTML(
                    "<div style='padding: 20px; text-align: center; color: #999;'>Click 'Run Evaluation' to start</div>"
                )

            with gr.Column(scale=1):
                answer_chart = gr.BarPlot(
                    x="Category",
                    y="Average Accuracy",
                    title="Average Accuracy by Category",
                    y_lim=[1, 5],
                    height=400,
                )

        # Wire up the evaluations: each handler returns (html, dataframe),
        # matching the two output components listed for its button.
        retrieval_button.click(
            fn=run_retrieval_evaluation,
            outputs=[retrieval_metrics, retrieval_chart],
        )

        answer_button.click(
            fn=run_answer_evaluation,
            outputs=[answer_metrics, answer_chart],
        )

    app.launch(inbrowser=True)


# Launch the dashboard when this code is executed as the main module.
# NOTE(review): inside a notebook kernel __name__ is presumably "__main__", so
# running this cell launches the app immediately — confirm that is intended.
if __name__ == "__main__":
    main()


In [None]:
# (Stray input removed: the bare name `w` was undefined and raised NameError on "Run All".)