# RAG EVALUATION 

In [None]:
!pip install deepeval

## Imports

In [15]:
# Standard library
import os
from getpass import getpass

# Importing the necessary evaluation metrics from the deepeval library
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.metrics import ContextualRecallMetric
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate


## Metrics configuration and initialization

In [None]:

# Defining the environment variable for OpenAI API key
os.environ["OPENAI_API_KEY"] = "your-key-here"

# Initializing test inputs
actual_output = ""       
expected_output = ""    
retrieval_context = [] 

# Initializing the ContextualPrecisionMetric 
metric_CP = ContextualPrecisionMetric(
    threshold = 0.7,       
    model = "gpt-4o-mini",       
    include_reason = True 
)

# Initializing the ContextualRecallMetric 
metric_CR = ContextualRecallMetric(
    threshold = 0.7,        
    model = "gpt-4o-mini",        
    include_reason = True   
)


# Initializing the AnswerRelevancyMetric 
metric_AR = AnswerRelevancyMetric(
    threshold = 0.7,        
    model = "gpt-4o-mini",        
    include_reason = True   
)


# Initializing the FaithfulnessMetric 
metric_F = FaithfulnessMetric(
    threshold = 0.7,        
    model = "gpt-4o-mini",        
    include_reason = True   
)


# Creating a test case with the necessary inputs to evaluate the metric
test_case = LLMTestCase(
    input = "Where is the tour eiffel?",           
    actual_output = actual_output, 
    expected_output = expected_output,  
    retrieval_context = retrieval_context  
)

## Executing the tests

In [None]:
# Creating a test case to evaluate contextual precision (CP)
test_case_CP = LLMTestCase(
    input = "",           
    actual_output = actual_output, 
    expected_output = expected_output,  
    retrieval_context = retrieval_context  
)

# Creating a test case to evaluate contextual recall (CR)
test_case_CR = LLMTestCase(
    input = "",           
    actual_output = actual_output, 
    expected_output = expected_output,  
    retrieval_context = retrieval_context  
)

# Creating a test case to evaluate answer relevancy (AR)
test_case_AR = LLMTestCase(
    input = "",           
    actual_output = actual_output,  
)

# Creating a test case to evaluate faithfulness (F)
test_case_F = LLMTestCase(
    input = "",           
    actual_output = actual_output,  
    retrieval_context = retrieval_context  
)

# Measuring the evaluation score for the test case
metric_CP.measure(test_case_CP)
metric_CR.measure(test_case_CR)  
metric_AR.measure(test_case_AR)  
metric_F.measure(test_case_F)  

# Evaluating the test case with the metric and displaying results
evaluate([test_case_CP], [metric_CP])  
evaluate([test_case_CR], [metric_CR])  
evaluate([test_case_AR], [metric_AR])  
evaluate([test_case_F], [metric_F])  

# Printing results
print(f"metric CP score: {metric_CP.score}") 
print(f"metric CP reason: {metric_CP.reason}") 

# Printing the score and reasoning behind it
print(f"metric CR score: {metric_CR.score}") 
print(f"metric CR reason: {metric_CR.reason}") 

# Printing the score and reasoning behind it
print(f"metric AR score: {metric_AR.score}") 
print(f"metric AR reason: {metric_AR.reason}") 

# Printing the score and reasoning behind it
print(f"metric F score: {metric_F.score}") 
print(f"metric F reason: {metric_F.reason}") 