In [3]:
import os
from typing import Dict, List, Optional

from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI



In [4]:
# 1. First, create your ground truth dataset
# Each entry pairs a query with the response the application is expected to give.
# NOTE(review): the retrieval metrics used later in this notebook may need extra
# per-entry fields (e.g. expected chunks) — confirm against the TruLens docs.
ground_truths = [
    {"query": question, "expected_response": answer}
    for question, answer in (
        (
            "What is the capital of France?",
            "The capital of France is Paris.",
        ),
        (
            "Who wrote Romeo and Juliet?",
            "William Shakespeare wrote Romeo and Juliet.",
        ),
    )
]

# 2. Initialize the GroundTruthAgreement with your provider
# The API key is read from the environment rather than hard-coded; swap in your
# preferred provider here if you are not using OpenAI.
ground_truth = GroundTruthAgreement(
    ground_truths,
    provider=OpenAI(api_key=os.getenv("OPENAI_API_KEY")),
)



In [5]:
# 3. Create feedback functions using different evaluation methods
#
# One feedback function per metric, each wired to the record's main input and
# main output via `on_input_output()`:
#   f_agreement - semantic agreement (GPT-based evaluation)
#   f_bert      - BERT Score for semantic similarity
#   f_bleu      - BLEU score for token overlap
#   f_rouge     - ROUGE score
f_agreement, f_bert, f_bleu, f_rouge = (
    Feedback(metric).on_input_output()
    for metric in (
        ground_truth.agreement_measure,
        ground_truth.bert_score,
        ground_truth.bleu,
        ground_truth.rouge,
    )
)



✅ In agreement_measure, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In agreement_measure, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In bert_score, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In bert_score, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In bleu, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In bleu, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In rouge, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In rouge, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [6]:
# 4. For evaluating retrieved contexts specifically
def evaluate_retrieval(
    query: str,
    retrieved_chunks: List[str],
    relevance_scores: Optional[List[float]] = None,
    k: Optional[int] = None,
) -> Dict[str, float]:
    """Compute ground-truth retrieval metrics for a single query.

    Delegates to the module-level ``ground_truth`` object and collects the
    results of its ``ndcg_at_k``, ``precision_at_k``, ``recall_at_k``, ``mrr``
    and ``ir_hit_rate`` methods into one dictionary.

    Args:
        query: The query whose retrieval results are being evaluated.
        retrieved_chunks: The chunks returned by the retriever for ``query``.
        relevance_scores: Optional scores for ``retrieved_chunks``; passed
            through unchanged (``None`` when omitted). Not used for the hit
            rate.
        k: Optional cut-off passed through to the ``@k`` metrics and the hit
            rate (``None`` when omitted). Not used for MRR.

    Returns:
        A dict with the keys ``"ndcg@k"``, ``"precision@k"``, ``"recall@k"``,
        ``"mrr"`` and ``"hit_rate@k"``, each mapped to the value returned by
        the corresponding ``ground_truth`` method.
    """
    # Dict values are evaluated top to bottom, so the metric calls happen in
    # the same order as the keys are listed.
    return {
        "ndcg@k": ground_truth.ndcg_at_k(
            query, retrieved_chunks, relevance_scores, k
        ),
        "precision@k": ground_truth.precision_at_k(
            query, retrieved_chunks, relevance_scores, k
        ),
        "recall@k": ground_truth.recall_at_k(
            query, retrieved_chunks, relevance_scores, k
        ),
        # MRR takes no cut-off argument.
        "mrr": ground_truth.mrr(query, retrieved_chunks, relevance_scores),
        # Hit rate takes no relevance scores.
        "hit_rate@k": ground_truth.ir_hit_rate(query, retrieved_chunks, k),
    }



In [None]:
# 5. Use with your RAG application
# NOTE(review): `tru`, `rag_app`, and `query` are not defined in any cell shown
# in this notebook; they must be created before this cell runs, otherwise it
# raises NameError on a fresh kernel. The cell has no execution count, so treat
# it as a template to adapt rather than as runnable code.
# NOTE(review): `tru.recorder()` is presumably meant to be a recording context
# manager — confirm the exact recorder API for the TruLens version in use.
with tru.recorder() as recording:
    # Your RAG query execution here
    response = rag_app(query)
    
    # Get feedback scores by calling each feedback function directly on the
    # (query, response) pair
    agreement_score = f_agreement(query, response)
    bert_score = f_bert(query, response)
    bleu_score = f_bleu(query, response)
    rouge_score = f_rouge(query, response)
    
    # If you want to evaluate retrieval specifically
    # (relevance_scores is omitted here, so evaluate_retrieval passes None)
    retrieval_metrics = evaluate_retrieval(
        query=query,
        retrieved_chunks=rag_app.get_retrieved_chunks(),  # Your retrieval function
        k=5  # Evaluate top-5 results
    )