In [2]:
!pip install deepeval

Collecting deepeval
  Downloading deepeval-3.5.4-py3-none-any.whl.metadata (18 kB)
Collecting anthropic (from deepeval)
  Downloading anthropic-0.68.0-py3-none-any.whl.metadata (28 kB)
Collecting ollama (from deepeval)
  Downloading ollama-0.5.4-py3-none-any.whl.metadata (4.3 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc<2.0.0,>=1.24.0 (from deepeval)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting posthog<7.0.0,>=6.3.0 (from deepeval)
  Downloading posthog-6.7.5-py3-none-any.whl.metadata (6.0 kB)
Collecting pyfiglet (from deepeval)
  Downloading pyfiglet-1.0.4-py3-none-any.whl.metadata (7.4 kB)
Collecting pytest-asyncio (from deepeval)
  Downloading pytest_asyncio-1.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.4-py3-none-any.whl.metadata (4.9 kB)
Col

In [3]:
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

In [6]:
import os
from google.colab import userdata

# Copy the key from Colab's user secrets into the process environment,
# so it is never hardcoded in the notebook itself.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

#### Test correctness

In [8]:
# Reference answer and the (deliberately terse, oddly-cased) prediction to grade.
gt_answer = "Madrid is the capital of Spain."
pred_answer = "MadriD."

test_case_correctness = LLMTestCase(
    input="What is the capital of Spain?",
    actual_output=pred_answer,
    expected_output=gt_answer,
)

# G-Eval metric configured with a single custom evaluation step.
# NOTE(review): only EXPECTED_OUTPUT and ACTUAL_OUTPUT are listed in
# evaluation_params, so the question presumably is not shown to the judge --
# confirm whether INPUT should be included as well.
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4.1",
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ],
    evaluation_params=[
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)

# Run the metric on the test case, then print the resulting score.
correctness_metric.measure(test_case_correctness)
print(correctness_metric.score)

Output()

0.16791787121629465


#### Test faithfulness

In [9]:
# Toy example: the retrieval context and the generated answer are the same string.
question = "what is 3+3?"
generated_answer = "6"
context = ["6"]

test_case = LLMTestCase(
    input=question,
    actual_output=generated_answer,
    retrieval_context=context,
)

faithfulness_metric = FaithfulnessMetric(
    model="gpt-4.1",
    threshold=0.7,
    include_reason=False,
)

# Run the metric, then print its score and reason.
faithfulness_metric.measure(test_case)
print(faithfulness_metric.score)
# With include_reason=False above, the recorded run printed None for the reason.
print(faithfulness_metric.reason)

Output()

1.0
None


#### Test contextual relevancy

In [10]:
# Three retrieved snippets; only the last one mentions shoes that don't fit.
retrieval_context = [
    "this is a test context",
    "mike is a cat",
    "if the shoes don't fit, then go somewhere else.",
]
actual_output = "then go somewhere else."
# NOTE: this rebinds gt_answer, which the correctness cell also assigns.
gt_answer = "if the shoes don't fit, then go somewhere else."

relevance_test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output,
    expected_output=gt_answer,
    retrieval_context=retrieval_context,
)

relevance_metric = ContextualRelevancyMetric(
    model="gpt-4.1-nano",
    threshold=1,
    include_reason=True,
)

# Run the metric, then print both the score and the generated explanation.
relevance_metric.measure(relevance_test_case)
print(relevance_metric.score)
print(relevance_metric.reason)

Output()

0.25
The score is 0.25 because the only relevant statement in the context is 'if the shoes don't fit, then go somewhere else,' which directly addresses the question about shoes not fitting, making it somewhat relevant despite the lack of additional context.


In [11]:
# Single test case carrying input, actual output, expected output and
# retrieval context together (same Spain question as the correctness cell).
new_test_case = LLMTestCase(
    input="What is the capital of Spain?",
    actual_output="MadriD.",
    expected_output="Madrid is the capital of Spain.",
    retrieval_context=["Madrid is the capital of Spain."],
)