# RAG Evaluation

In [1]:
from dotenv import load_dotenv

load_dotenv('../.env')

True

In [16]:
import os
import pandas as pd
import time
from rag.rag_pipeline import RAGPipeline, GPTunnelLLM
from langchain_community.retrievers import ArxivRetriever
from langsmith import Client, evaluate
from langsmith.schemas import Example, Run
from langchain import hub
from langchain_mistralai import ChatMistralAI

In [3]:
retriever = ArxivRetriever(
        top_k_results=3,
        get_full_documents=False,  # gives errors with MuPDF when True
        doc_content_chars_max=10000000000
    )

gptunell_key = os.environ.get('GPTUNNEL_API_KEY')
gptunnel_llm = GPTunnelLLM(api_key=gptunell_key)

assistant = RAGPipeline(llm=gptunnel_llm, retriever=retriever)

# Example query
question = "How does ImageBind model bind multiple modalities into a single embedding space? Tell me in detail."
response = assistant.handle_user_input(question)
print(response)

ImageBind model binds multiple modalities into a single embedding space through a learnable bind network. This network aligns the embedding space between LLaMA and ImageBind's image encoder. The image features transformed by the bind network are then added to word tokens of all layers in LLaMA, progressively injecting visual instructions via an attention-free and zero-initialized gating mechanism. This process enables the model to exhibit superior multi-modality instruction-following capabilities.


In [4]:
langsmith_client = Client()

In [13]:
eval_questions_df = pd.read_csv('data/evaluation_questions.csv')
dataset_name = "Arxiv RAG Evaluation Questions"

In [24]:
def correct_answer(root_run: Run, example: Example) -> dict:
    score = root_run.outputs.get("output") == example.outputs.get("answer")
    return {"score": int(score), "key": "correct_answer"}

In [25]:
results = evaluate(
    lambda inputs: assistant.handle_user_input(inputs["question"]),
    data=dataset_name,
    evaluators=[correct_answer],
    experiment_prefix="Arxiv RAG Queries",
    # description="Testing the baseline system.",  # optional
)

View the evaluation results for experiment: 'Agent RAG Queries-355d19f5' at:
https://smith.langchain.com/o/76b2dc9d-4b98-4e5e-983b-623eb76c0ac6/datasets/62c45b24-543f-4290-aeb6-cb696dd9cb06/compare?selectedSessions=b3a2cc3c-1bfd-4022-8534-24dd1d3e5a50




0it [00:00, ?it/s]

In [11]:
def predict_rag_answer(example: dict):
    """Use this for answer evaluation"""
    response = assistant.handle_user_input(example["question"])
    return {"answer": response}

# todo: implement context docs returning
def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations"""
    response = assistant.handle_user_input(example["question"])
    return {"answer": response["answer"], "contexts": response["contexts"]}

### Response vs reference answer

In [21]:
# Grade prompt
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer accuracy
    """

    # Get question, ground truth answer, RAG chain answer
    input_question = example.inputs["question"]
    reference = example.outputs["ground_truth"]
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatMistralAI(
        model="mistral-large-latest",
        temperature=0,
        max_retries=2,
        # other params...
    )

    # Structured prompt
    answer_grader = grade_prompt_answer_accuracy | llm

    # Run evaluator
    score = answer_grader.invoke({"question": input_question,
                                  "correct_answer": reference,
                                  "student_answer": prediction})
    score = score["Score"]
    
    time.sleep(3) # isn't clear if this helps
    return {"key": "answer_v_reference_score", "score": score}

In [6]:
llm = ChatMistralAI(
        model="mistral-large-latest",
        temperature=0,
        max_retries=2,
        # other params...
    )

In [8]:
llm.invoke('who created you?')

AIMessage(content='I was created by Mistral AI.', response_metadata={'token_usage': {'prompt_tokens': 7, 'total_tokens': 15, 'completion_tokens': 8}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run-5de10ec5-3165-4559-96bc-11b8b4a33a13-0', usage_metadata={'input_tokens': 7, 'output_tokens': 8, 'total_tokens': 15})

In [22]:
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="rag-answer-v-reference",
)

View the evaluation results for experiment: 'rag-answer-v-reference-3c61724d' at:
https://smith.langchain.com/o/76b2dc9d-4b98-4e5e-983b-623eb76c0ac6/datasets/62c45b24-543f-4290-aeb6-cb696dd9cb06/compare?selectedSessions=0cf2c847-7349-4625-b365-08df545700f5




0it [00:00, ?it/s]

Error running evaluator <DynamicRunEvaluator answer_evaluator> on run 54318d7d-e66f-4e28-adf7-7ef4afbb546b: HTTPStatusError('Error response 429 while fetching https://api.mistral.ai/v1/chat/completions: {"message":"Requests rate limit exceeded"}')
Traceback (most recent call last):
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\_runner.py", line 1357, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\evaluator.py", line 327, in evaluate_run
    result = self.func(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 612, in wrapper
    raise e
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 609, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "<ipython-input-21-830c7579816c>", line 26, in answer_evaluator
    score = answer_grader.invoke({"question": input_quest

### Response vs input

In [23]:
# Grade prompt
grade_prompt_answer_helpfulness = hub.pull("langchain-ai/rag-answer-helpfulness")

def answer_helpfulness_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer helpfulness
    """

    # Get question, ground truth answer, RAG chain answer
    input_question = example.inputs["question"]
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatMistralAI(
        model="mistral-large-latest",
        temperature=0,
        max_retries=2,
        # other params...
    )



    # Structured prompt
    answer_grader = grade_prompt_answer_helpfulness | llm

    # Run evaluator
    score = answer_grader.invoke({"question": input_question,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_helpfulness_score", "score": score}

In [24]:
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_helpfulness_evaluator],
    experiment_prefix="rag-answer-helpfulness",
)

View the evaluation results for experiment: 'rag-answer-helpfulness-60953126' at:
https://smith.langchain.com/o/76b2dc9d-4b98-4e5e-983b-623eb76c0ac6/datasets/62c45b24-543f-4290-aeb6-cb696dd9cb06/compare?selectedSessions=9eeef7bd-3168-410e-b45d-32ed240c68a7




0it [00:00, ?it/s]

Error running evaluator <DynamicRunEvaluator answer_helpfulness_evaluator> on run d3a1cadd-d07a-4168-bea6-dd2e39b71ac8: HTTPStatusError('Error response 429 while fetching https://api.mistral.ai/v1/chat/completions: {"message":"Requests rate limit exceeded"}')
Traceback (most recent call last):
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\_runner.py", line 1357, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\evaluator.py", line 327, in evaluate_run
    result = self.func(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 612, in wrapper
    raise e
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 609, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "<ipython-input-23-f5956559c19e>", line 27, in answer_helpfulness_evaluator
    score = answer_grader.invoke(

### Response vs retrieved docs

In [25]:
# Prompt
grade_prompt_hallucinations = hub.pull("langchain-ai/rag-answer-hallucination")

def answer_hallucination_evaluator(run, example) -> dict:
    """
    A simple evaluator for generation hallucination
    """

    # RAG inputs
    input_question = example.inputs["question"]
    contexts = run.outputs["contexts"]

    # RAG answer
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatMistralAI(
        model="mistral-large-latest",
        temperature=0,
        max_retries=2,
        # other params...
    )

    # Structured prompt
    answer_grader = grade_prompt_hallucinations | llm

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_hallucination", "score": score}

In [26]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-answer-hallucination",
)

View the evaluation results for experiment: 'rag-answer-hallucination-f6b4e418' at:
https://smith.langchain.com/o/76b2dc9d-4b98-4e5e-983b-623eb76c0ac6/datasets/62c45b24-543f-4290-aeb6-cb696dd9cb06/compare?selectedSessions=1af90b89-bfe5-4c88-afda-c938d6a327da




0it [00:00, ?it/s]

Error running target function: string indices must be integers
Error running evaluator <DynamicRunEvaluator answer_hallucination_evaluator> on run 238b353d-0a38-48c9-ad53-36a03ae7643c: KeyError('contexts')
Traceback (most recent call last):
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\_runner.py", line 1357, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\evaluator.py", line 327, in evaluate_run
    result = self.func(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 612, in wrapper
    raise e
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 609, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "<ipython-input-25-bf043159ce4b>", line 11, in answer_hallucination_evaluator
    contexts = run.outputs["contexts"]
KeyError: 'contexts'
Error running target func

### Retrieved docs vs input

In [27]:
# Grade prompt
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")

def docs_relevance_evaluator(run, example) -> dict:
    """
    A simple evaluator for document relevance
    """

    # RAG inputs
    input_question = example.inputs["question"]
    contexts = run.outputs["contexts"]

    # LLM grader
    llm = ChatMistralAI(
        model="mistral-large-latest",
        temperature=0,
        max_retries=2,
        # other params...
    )

    # Structured prompt
    answer_grader = grade_prompt_doc_relevance | llm

    # Get score
    score = answer_grader.invoke({"question":input_question,
                                  "documents":contexts})
    score = score["Score"]

    return {"key": "document_relevance", "score": score}

In [28]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-doc-relevance",
)

View the evaluation results for experiment: 'rag-doc-relevance-ebbd0690' at:
https://smith.langchain.com/o/76b2dc9d-4b98-4e5e-983b-623eb76c0ac6/datasets/62c45b24-543f-4290-aeb6-cb696dd9cb06/compare?selectedSessions=29432ea3-92cc-4199-8c42-29d496087df2




0it [00:00, ?it/s]

Error running target function: 'choices'
Error running evaluator <DynamicRunEvaluator docs_relevance_evaluator> on run 3f44c1e8-047f-479b-aea6-0d840f233a8b: KeyError('contexts')
Traceback (most recent call last):
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\_runner.py", line 1357, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\evaluation\evaluator.py", line 327, in evaluate_run
    result = self.func(
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 612, in wrapper
    raise e
  File "C:\Program Files\Python38\lib\site-packages\langsmith\run_helpers.py", line 609, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "<ipython-input-27-15f1a8fa1611>", line 11, in docs_relevance_evaluator
    contexts = run.outputs["contexts"]
KeyError: 'contexts'
Error running target function: 'choices'
Error running eval