# Consolidated DeepEval Test Execution

This notebook aggregates the DeepEval-based evaluation scenarios defined in the repository's Python test files. Each metric-oriented suite is recreated with a lightweight dataset so that the evaluations can be orchestrated from a single location.


In [None]:
from dotenv import load_dotenv
load_dotenv()

import json

from pydantic import BaseModel
from IPython.display import Markdown, display

from deepeval.metrics import (
    FaithfulnessMetric,
    HallucinationMetric,
    PromptAlignmentMetric,
    SummarizationMetric,
    JsonCorrectnessMetric,
    AnswerRelevancyMetric,
    GEval,
)
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


In [None]:
# Pydantic model handed to JsonCorrectnessMetric as `expected_schema` in the
# 'JSON Correctness' suite of `dataset`. Documented with `#` comments rather
# than a docstring so the model definition itself stays untouched.
class ExampleSchema(BaseModel):
    name: str  # the single field declared on this schema

def as_markdown_table(rows, headers):
    """Render a list of dict rows as a pipe-delimited Markdown table.

    Args:
        rows: Iterable of mappings; each mapping supplies the cell values for
            one table row, looked up by header name.
        headers: Sequence of column names. They are used both as the header
            row and as the keys to read from every row.

    Returns:
        The table as a single string: a header line, a ``---`` separator line,
        and one line per row, joined with newlines. Missing keys and ``None``
        values render as empty cells.
    """

    def _cell(value):
        # None (or a missing key) becomes an empty cell; other values such as
        # 0 or False are kept and stringified.
        if value is None:
            return ''
        text = str(value)
        # A raw '|' would start a new column and a raw line break would end
        # the table row, so both are neutralised before the cell is emitted.
        text = text.replace('|', '\\|')
        return text.replace('\r\n', '<br>').replace('\n', '<br>').replace('\r', '<br>')

    lines = [
        '|' + '|'.join(_cell(h) for h in headers) + '|',
        '|' + '|'.join(['---'] * len(headers)) + '|',
    ]
    for row in rows:
        lines.append('|' + '|'.join(_cell(row.get(h)) for h in headers) + '|')
    return '\n'.join(lines)

# Judge model used by every metric below. Kept in one place so the whole
# notebook can be pointed at a different model with a single edit.
EVAL_MODEL = 'gpt-4o-mini'

# One entry per metric suite. Each entry carries:
#   - 'test_suite' / 'metric_name': labels used in the rendered tables,
#   - 'metric_factory': zero-argument callable that builds the metric; it is
#     invoked once per case by the execution cell,
#   - 'cases': list of dicts with a 'case_id' and the keyword arguments that
#     are unpacked into LLMTestCase(...).
dataset = [
    {
        'test_suite': 'Faithfulness',
        'metric_name': 'FaithfulnessMetric',
        'metric_factory': lambda: FaithfulnessMetric(threshold=0.7, model=EVAL_MODEL, include_reason=True),
        'cases': [
            {
                'case_id': 'faithfulness_case_1',
                'llm_test_case_kwargs': {
                    'input': "What if these shoes don't fit?",
                    'actual_output': "We offer a 30-day full refund at no extra cost.",
                    'retrieval_context': ["All customers are eligible for a 30 day full refund at no extra cost."]
                }
            }
        ],
    },
    {
        'test_suite': 'Hallucination',
        'metric_name': 'HallucinationMetric',
        'metric_factory': lambda: HallucinationMetric(threshold=0.5, model=EVAL_MODEL, include_reason=True),
        'cases': [
            {
                'case_id': 'hallucination_case_1',
                'llm_test_case_kwargs': {
                    'input': "What was the blond doing?",
                    'actual_output': "A blond drinking water in public.",
                    'context': ["A man with blond-hair, and a brown shirt drinking out of a public water fountain."]
                }
            }
        ],
    },
    {
        'test_suite': 'Prompt Alignment',
        'metric_name': 'PromptAlignmentMetric',
        'metric_factory': lambda: PromptAlignmentMetric(prompt_instructions=["Reply in all uppercase"], model=EVAL_MODEL, include_reason=True),
        'cases': [
            {
                'case_id': 'prompt_alignment_case_1',
                'llm_test_case_kwargs': {
                    'input': "What is capital of India?",
                    'actual_output': "THE CAPITAL OF INDIA IS NEW DELHI."
                }
            }
        ],
    },
    {
        'test_suite': 'Summarization',
        'metric_name': 'SummarizationMetric',
        'metric_factory': lambda: SummarizationMetric(threshold=0.7, model=EVAL_MODEL),
        'cases': [
            {
                'case_id': 'summarization_case_1',
                'llm_test_case_kwargs': {
                    'input': "Rice is the staple food of Bengal. Bhortas (lit-\"mashed\") are a really common type of food used as an additive too rice. there are several types of Bhortas such as Ilish bhorta shutki bhorta, begoon bhorta and more. Fish and other seafood are also important because Bengal is a reverrine region.\nSome fishes like puti (Puntius species) are fermented. Fish curry is prepared with fish alone or in combination with vegetables. Shutki maach is made using the age-old method of preservation where the food item is dried in the sun and air, thus removing the water content. This allows for preservation that can make the fish last for months, even years in Bangladesh",
                    'actual_output': "Bengali cuisine centers on rice and diverse mashed accompaniments called bhortas, along with plentiful fish dishes that are often dried or fermented to extend their shelf life."
                }
            }
        ],
    },
    {
        'test_suite': 'GEval Correctness',
        'metric_name': 'GEval',
        'metric_factory': lambda: GEval(
            name='Correctness',
            model=EVAL_MODEL,
            evaluation_params=[
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
                "Lightly penalize omission of detail while focusing on the main idea",
                "Vague language, or contradicting opinions, are acceptable",
            ],
        ),
        'cases': [
            {
                # Actual output is identical to the expected output.
                'case_id': 'geval_case_1',
                'llm_test_case_kwargs': {
                    'input': "What are the main causes of deforestation?",
                    'actual_output': "The main causes of deforestation include agricultural expansion, logging, infrastructure development, and urbanization.",
                    'expected_output': "The main causes of deforestation include agricultural expansion, logging, infrastructure development, and urbanization."
                }
            },
            {
                # Actual output is a shorter version of the expected output.
                'case_id': 'geval_case_2',
                'llm_test_case_kwargs': {
                    'input': "Define the term 'artificial intelligence'.",
                    'actual_output': "Artificial intelligence is the simulation of human intelligence by machines.",
                    'expected_output': "Artificial intelligence refers to the simulation of human intelligence in machines that are programmed to think and learn like humans, including tasks such as problem-solving, decision-making, and language understanding."
                }
            },
            {
                # Actual output names different colors than the expected output.
                'case_id': 'geval_case_3',
                'llm_test_case_kwargs': {
                    'input': "List the primary colors.",
                    'actual_output': "The primary colors are green, orange, and purple.",
                    'expected_output': "The primary colors are red, blue, and yellow."
                }
            },
        ],
    },
    {
        'test_suite': 'JSON Correctness',
        'metric_name': 'JsonCorrectnessMetric',
        'metric_factory': lambda: JsonCorrectnessMetric(expected_schema=ExampleSchema, model=EVAL_MODEL, include_reason=True),
        'cases': [
            {
                'case_id': 'json_correctness_case_1',
                'llm_test_case_kwargs': {
                    'input': "Output me a random Json with the 'name' key",
                    'actual_output': '{\"name\": \"A Random Name\"}'
                }
            }
        ],
    },
    {
        'test_suite': 'Answer Relevancy',
        'metric_name': 'AnswerRelevancyMetric',
        'metric_factory': lambda: AnswerRelevancyMetric(threshold=0.7, model=EVAL_MODEL),
        'cases': [
            {
                'case_id': 'relevancy_case_1',
                'llm_test_case_kwargs': {
                    'input': "Can I return these shoes after 30 days?",
                    'actual_output': "Yes, you can return them. We offer a 30-day full refund. Do you have your original receipt?",
                    'retrieval_context': [
                        "All customers are eligible for a 30-day full refund at no extra cost.",
                        "Returns are only accepted within 30 days of purchase.",
                    ],
                }
            },
            {
                'case_id': 'relevancy_case_2',
                'llm_test_case_kwargs': {
                    'input': "Can I return these shoes after 30 days?",
                    'actual_output': "Unfortunately, returns are only accepted within 30 days of purchase.",
                    'retrieval_context': [
                        "All customers are eligible for a 30-day full refund at no extra cost.",
                        "Returns are only accepted within 30 days of purchase.",
                    ],
                }
            },
        ],
    },
]


## Dataset Overview

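The table below lists every test case in the dataset — suite, metric, case ID, input, actual output, and any additional `LLMTestCase` arguments — mirroring the original test scenarios so that each metric can be executed programmatically.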


In [None]:
# Flatten the nested suite -> case structure into one row per case so it can
# be shown as a single table.
headers = ['test_suite', 'metric', 'case_id', 'input', 'actual_output', 'additional_args']
dataset_rows = []

for entry in dataset:
    for case in entry['cases']:
        kwargs = case['llm_test_case_kwargs']
        # Everything other than the two columns shown on their own is folded
        # into one JSON-encoded column.
        extras = {
            key: value
            for key, value in kwargs.items()
            if key not in ['input', 'actual_output']
        }
        row = dict(
            test_suite=entry['test_suite'],
            metric=entry['metric_name'],
            case_id=case['case_id'],
            input=kwargs.get('input'),
            actual_output=kwargs.get('actual_output'),
            additional_args=json.dumps(extras),
        )
        dataset_rows.append(row)

table_md = as_markdown_table(dataset_rows, headers)
display(Markdown(table_md))


## Metric Execution

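Each case is evaluated using its corresponding metric. The resulting scores, reasoning (when provided by the metric), and execution status are consolidated below. Note that a `success` status only means the metric ran without raising an exception; it does not indicate that the score met the metric's threshold.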


In [None]:
# Run every case against a freshly built instance of its metric and collect
# one result row per case.
results = []

for entry in dataset:
    for case in entry['cases']:
        try:
            # Metric construction and test-case construction are inside the
            # guard as well, so a failure there is recorded as an 'error' row
            # for this case instead of aborting the remaining cases.
            metric = entry['metric_factory']()
            test_case = LLMTestCase(**case['llm_test_case_kwargs'])
            metric.measure(test_case)
        except Exception as exc:
            # Best-effort boundary: record the failure and move on. The
            # exception type is included because str(exc) can be empty.
            score = None
            reason = f'{type(exc).__name__}: {exc}'
            status = 'error'
        else:
            # 'success' means the metric ran without raising; it says nothing
            # about whether the score met the metric's threshold.
            score = getattr(metric, 'score', None)
            reason = getattr(metric, 'reason', '')
            status = 'success'
        results.append({
            'test_suite': entry['test_suite'],
            'metric': entry['metric_name'],
            'case_id': case['case_id'],
            'score': score,
            'reason_or_error': reason,
            'status': status,
        })

result_headers = ['test_suite', 'metric', 'case_id', 'score', 'reason_or_error', 'status']
results_table_md = as_markdown_table(results, result_headers)
display(Markdown(results_table_md))
