In [50]:
import pandas as pd
from tqdm.auto import tqdm
import json


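### Retrieval evaluation
Retrieval evaluation measures how well the search performs: for every ground-truth question we check whether the expected document is returned (hit rate) and how high it ranks in the results (MRR).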

In [11]:
# Ground-truth questions used for the retrieval evaluation, read from a path relative to this notebook.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [12]:
# One dict per CSV row, so the evaluation loop can index fields by name.
ground_truth = df_question.to_dict(orient='records')

In [13]:
# Peek at the first record to see which fields a ground-truth entry carries.
ground_truth[0]

{'id': 0,
 'question': "What do I need to do after clicking the 'Sign Up' button?"}

In [5]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list of booleans per query, in ranking order;
            an entry is truthy when the result at that position is the
            expected document.

    Returns:
        A float in [0, 1]. An empty evaluation set yields 0.0 instead of
        raising ZeroDivisionError.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    n_hits = sum(1 for flags in relevance_total if any(flags))
    return n_hits / n_queries

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query, in ranking order;
            an entry is truthy when the result at that position is the
            expected document.

    Returns:
        A float in [0, 1]. Queries with no relevant result contribute 0.
        An empty evaluation set yields 0.0 instead of raising
        ZeroDivisionError.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for flags in relevance_total:
        for position, is_relevant in enumerate(flags, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant hit counts; without this break a
                # list with several matches would add multiple reciprocal
                # ranks and the metric could exceed 1.
                break

    return total_score / n_queries

In [4]:
from qdrant_client import QdrantClient, models

In [5]:
# Client for the Qdrant instance at localhost:6333; the search functions below use it as a module-level global.
qdrant_client = QdrantClient("http://localhost:6333")

In [6]:
def qdrant_search(query, limit=5):
    """Dense vector search over the "project" collection.

    Args:
        query: the question text to search for.
        limit: maximum number of closest matches to return.

    Returns:
        A list with the payload of each returned point, in the order the
        server returned them.
    """
    # The query is sent as a Document so the text is embedded with the
    # named model rather than passing a precomputed vector.
    embedded_query = models.Document(
        text=query,
        model="jinaai/jina-embeddings-v2-small-en",
    )

    response = qdrant_client.query_points(
        collection_name="project",
        query=embedded_query,
        limit=limit,
        with_payload=True,  # payloads carry the document fields we need downstream
    )

    return [hit.payload for hit in response.points]

In [36]:
def rrf_search(query: str, limit: int = 1) -> list[dict]:
    """Hybrid search: dense and BM25 candidates fused with Reciprocal Rank Fusion.

    Two prefetch branches run against the "project-sparse-and-dense"
    collection (the "jina-small" and "bm25" named vectors), and their
    results are merged by an RRF fusion query.

    Args:
        query: the question text to search for.
        limit: scales the number of candidates each prefetch branch
            retrieves (3 * limit per branch).

    Returns:
        A list with the payload of each fused point, in fused order.
        (The previous annotation claimed ScoredPoint objects, but only
        the payloads are returned.)
    """
    # NOTE(review): `limit` is only applied to the prefetch branches. The
    # fusion query itself passes no limit, so the number of returned
    # payloads is governed by the client's default rather than by `limit`.
    # Left unchanged because the metrics recorded in this notebook were
    # produced with this behaviour; confirm whether it is intended.
    prefetch_limit = 3 * limit

    result_points = qdrant_client.query_points(
        collection_name="project-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=prefetch_limit,
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=prefetch_limit,
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )

    return [point.payload for point in result_points.points]

In [23]:
# Single-query walkthrough of the steps the evaluate function below performs for every record.
# `q` was previously never assigned in the notebook (it only existed as leftover
# kernel state), so this cell failed with NameError on a fresh run.
q = ground_truth[0]

relevance_total = []
doc_id = q['id']
results = qdrant_search(q['question'])
# One boolean per returned document: True where it is the expected one.
relevance = [d['id'] == doc_id for d in results]
relevance_total.append(relevance)

In [30]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: iterable of records with an 'id' and a 'question'.
        search_function: callable taking the question text and returning
            a list of documents, each with an 'id'.

    Returns:
        A dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def _relevance_flags(record):
        # True at every position where the returned document is the expected one.
        expected_id = record['id']
        retrieved = search_function(record['question'])
        return [hit['id'] == expected_id for hit in retrieved]

    per_query_flags = [_relevance_flags(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query_flags),
        mrr=mrr(per_query_flags),
    )

In [33]:
# Score the hybrid (RRF) search; evaluate passes only the question text, so rrf_search runs with its default limit.
evaluate(ground_truth, rrf_search)

  0%|          | 0/395 [00:00<?, ?it/s]

{'hit_rate': 0.9063291139240506, 'mrr': 0.7727004219409283}

In [16]:
from openai import OpenAI
# Module-level client used by llm(). No API key is passed here, so it
# presumably comes from the environment — TODO confirm.
client_openai = OpenAI()

### LLM-as-a-Judge

In [30]:
# Judge prompt, filled in with str.format(question=..., answer=...).
# The doubled braces ({{ and }}) are format escapes that render as literal
# braces in the JSON skeleton shown to the model.
evaluation_prompt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [34]:
def build_prompt(query, search_results):
    """Assemble the RAG prompt from a question and the retrieved FAQ entries.

    Args:
        query: the user question, inserted after "QUESTION:".
        search_results: iterable of dicts with 'prompt' and 'response'
            keys; each is rendered as a "prompt: ... / response: ..." pair.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    faq_context = "".join(
        f"prompt: {entry['prompt']}\nresponse: {entry['response']}\n\n"
        for entry in search_results
    )

    # Only the outer whitespace is stripped; the interior lines keep the
    # indentation of this literal.
    template = """
            You're are the customer service chatbot of an e-commerce platform. Answer the QUESTION based on the CONTEXT from the FAQ database.
            Use only the facts from the CONTEXT when answering the QUESTION.
            
            QUESTION: {question}
            
            CONTEXT: 
            {context}
            """.strip()

    return template.format(question=query, context=faq_context).strip()

In [31]:
def llm(prompt, model="gpt-4o-mini"):
    """Send a single user message to the chat completions API and return the reply text.

    Args:
        prompt: text sent as the content of the only (user) message.
        model: name of the chat model to use.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client_openai.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [48]:
def _parse_evaluation(raw_reply):
    """Turn the judge's raw reply into a dict that always has a 'Relevance' key."""
    fallback = {"Relevance": "UNKNOWN", "Explanation": "Failed to parse evaluation"}

    # A missing reply (None) would make json.loads raise TypeError, which
    # the JSONDecodeError handler below does not cover.
    if not isinstance(raw_reply, str):
        return fallback

    text = raw_reply.strip()
    if text.startswith("```"):
        # The prompt asks for no code blocks, but tolerate a fenced reply
        # (``` or ```json) instead of discarding an otherwise valid verdict.
        text = text.strip("`").strip()
        if text[:4].lower() == "json":
            text = text[4:]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return fallback

    # Valid JSON is not enough: callers index the result with ['Relevance'].
    if not isinstance(parsed, dict) or "Relevance" not in parsed:
        return fallback

    return parsed


def evaluate_relevance(question, answer):
    """Ask the judge model how relevant a generated answer is to its question.

    Args:
        question: the question that was asked.
        answer: the generated answer to be judged.

    Returns:
        The parsed JSON verdict as a dict containing at least 'Relevance'.
        When the reply is missing, is not valid JSON, is not a JSON
        object, or lacks 'Relevance', the fallback
        {"Relevance": "UNKNOWN", "Explanation": "Failed to parse evaluation"}
        is returned instead.
    """
    prompt = evaluation_prompt_template.format(question=question, answer=answer)
    evaluation = llm(prompt, model="gpt-4o-mini")
    return _parse_evaluation(evaluation)

In [45]:
# NOTE(review): inspection cell. In the visible notebook `answer` is only assigned
# by the generation loop further down, so this fails on a fresh top-to-bottom run.
answer

"You can find the 'Sign Up' button on the top right corner of our website."

In [46]:
# NOTE(review): inspection cell. In the visible notebook `row` is only bound by the
# generation loop further down, so this fails on a fresh top-to-bottom run.
row['question']

"Where can I find the 'Sign Up' button on your website?"

In [24]:
# One randomly chosen question per id; the fixed random_state keeps the sample reproducible.
df_random = df_question.groupby("id", group_keys=False).sample(1, random_state=42)

In [82]:
# Generate an answer for every sampled question and have the judge rate it.
ids = []
questions = []
answers = []
relevances = []
# Each iteration makes one retrieval call and two LLM calls (answer + judge),
# so show progress.
for row in tqdm(df_random.to_dict(orient='records')):
    answer = llm(build_prompt(row['question'], rrf_search(row['question'])))
    verdict = evaluate_relevance(row['question'], answer)
    # A malformed judge reply must not abort the whole run; record it as UNKNOWN.
    if isinstance(verdict, dict):
        relevance_label = verdict.get('Relevance', 'UNKNOWN')
    else:
        relevance_label = 'UNKNOWN'

    # Append only after both calls succeeded, so the four lists always have
    # equal length even if the loop is interrupted part-way.
    ids.append(row['id'])
    questions.append(row['question'])
    answers.append(answer)
    relevances.append(relevance_label)

In [88]:
# Collect the per-question results into one table, one row per sampled question.
df_relevance = pd.DataFrame(
    dict(
        id=ids,
        question=questions,
        answer=answers,
        relevance=relevances,
    )
)

### RELEVANCE 

In [97]:
# Persist the judged answers next to the other data files, without the DataFrame index.
df_relevance.to_csv('../data/rag-eval.csv', index=False)

In [95]:
# Share of judged answers that were labelled RELEVANT.
len(df_relevance.query("relevance == 'RELEVANT'")) / len(df_relevance)

0.8734177215189873