In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A Practical Guide with Gen AI Evaluation Service


## Overview

This notebook demonstrates how to use the Vertex AI SDK for the Gen AI Evaluation Service to compare two first-party models (for example, when considering a migration) and to optimize prompts. We will use predefined adaptive rubric-based metrics as well as a custom metric of your own. Additionally, we'll touch upon how evaluation results can guide prompt optimization.

---

Key features highlighted in this notebook include:


*   **A Complete Evaluation Workflow**: The SDK provides a seamless experience from generating model responses with `run_inference()` to detailed assessment with `evaluate()`.

*   **Flexible, Multi-Candidate Evaluation**: Easily analyze and compare the performance of multiple AI models, agents, or configurations in a single run. The SDK provides a unified report with comprehensive results and win-rate calculations for all contenders.

*   **Rich In-Notebook Visualization**: The `.show()` method, available on both `EvaluationDataset` and `EvaluationResult` objects, renders an interactive HTML report for analysis directly within your Colab and Jupyter notebooks.


*   **Integrated Prompt Optimization**: Iteratively improve your prompts using the built-in `prompt_optimizer` module and immediately re-evaluate to quantify the impact of your changes.


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started



In [None]:
# @title ### Install Vertex AI SDK for Gen AI Evaluation Service

# Installs/upgrades the Vertex AI SDK with its `evaluation` extra (minimum version 1.111.0).
# NOTE(review): `--force-reinstall` reinstalls the package even when it is already
# present; a runtime restart may be needed afterwards — confirm for your environment.
%pip install --upgrade "google-cloud-aiplatform[evaluation]>=1.111.0" --force-reinstall --quiet --no-warn-conflicts

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# @title ### Authenticate your notebook environment (Colab only)
# @markdown If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

import sys

# Authentication is only attempted when the `google.colab` module is already
# loaded in this kernel; in any other environment this cell does nothing.
in_colab = "google.colab" in sys.modules
if in_colab:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
# @title ### Set Google Cloud project information
# @markdown To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).
# @markdown Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

# @markdown ---

import os

from vertexai import Client, types

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# Fall back to the environment when the form field is empty or still holds the placeholder.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    # Fail fast with a clear message instead of coercing a missing value to the
    # literal string "None" and handing that to the client as a project ID.
    raise ValueError(
        "PROJECT_ID is not set. Fill in the form field above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "us-central1"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}
# A GOOGLE_CLOUD_REGION environment variable, when present, takes precedence over the form value.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", LOCATION)

# Shared client used by every evaluation and prompt-optimization call in this notebook.
client = Client(project=PROJECT_ID, location=LOCATION)

In [None]:
# @title ### Prepare Dataset and Generate Rubrics
# @markdown Rubrics are generated in a later cell and stored in a group named `general`.

import pandas as pd

# Number of rows to evaluate, and a fixed seed so the same rows (and therefore
# comparable scores) are selected on every Restart & Run All.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42

instruction = "Summarize the following article"

# Load the articles and drop the `id` column without mutating a frame in place.
dataset = pd.read_csv("test.csv").drop(columns=["id"])

# Substitute the instruction now; `{article}` stays as a placeholder that is
# filled in per row when the prompts are built.
prompt_template = "Instruction: {instruction} such that you're explaining it to a 5 year old. Article: {article}. Summary:"
prompt_template = prompt_template.replace("{instruction}", instruction)

# Sanity checks: null count per column and overall shape.
print(dataset.isnull().sum())
print(dataset.shape)

# Cap the sample size so a small CSV does not make `sample` raise.
eval_dataset = dataset.sample(min(SAMPLE_SIZE, len(dataset)), random_state=SAMPLE_SEED)



article      0
reference    0
dtype: int64
(50, 2)


In [None]:
# Preview the rows that were sampled for evaluation.
eval_dataset.head()

Unnamed: 0,article,reference
44,Former Queens Park Rangers chairman Gianni Pal...,Gianni Paladini has registered as director of ...
42,London commuters faced traffic chaos this morn...,Main roads in Holborn are closed more than 24 ...
22,Princess Beatrice has been seen at the Bahrain...,Beatrice seen watching race on terrace with th...
31,"Barely out of nappies, this five-year-old boy ...","Boy, five, pelts police with stones as Palesti..."
14,A thief nicknamed the 'Black Widow of Facebook...,"Sofia Davila, 21, nicknamed the 'Black Widow o..."


In [None]:
# Generate rubrics once from the shared prompt template (a single-row frame,
# not one row per article) and store them under the "general" rubric group.
rubric_source_df = pd.DataFrame({"prompt": [prompt_template]})
rubrics = client.evals.generate_rubrics(
    src=rubric_source_df,
    rubric_group_name="general",
    predefined_spec_name=types.RubricMetric.GENERAL_QUALITY,
)
rubrics.show()

In [None]:
# Inspect the generated rubric groups attached to the first row of the rubric dataset.
print(rubrics.eval_dataset_df['rubric_groups'][0])

{'general': [Rubric(
  content=RubricContent(
    property=RubricContentProperty(
      description='The response is in English.'
    )
  ),
  importance=<Importance.HIGH: 'HIGH'>,
  type='LANGUAGE:PRIMARY_RESPONSE_LANGUAGE'
), Rubric(
  content=RubricContent(
    property=RubricContentProperty(
      description='The response is a summary of the provided article.'
    )
  ),
  importance=<Importance.HIGH: 'HIGH'>,
  type='CONTENT_REQUIREMENT:SUMMARY'
), Rubric(
  content=RubricContent(
    property=RubricContentProperty(
      description='The summary is written in a style and tone appropriate for explaining to a 5-year-old.'
    )
  ),
  importance=<Importance.HIGH: 'HIGH'>,
  type='CONTENT_REQUIREMENT:STYLE_AND_TONE:CHILD_APPROPRIATE'
), Rubric(
  content=RubricContent(
    property=RubricContentProperty(
      description='The summary uses simple vocabulary and short, easy-to-understand sentences.'
    )
  ),
  importance=<Importance.HIGH: 'HIGH'>,
  type='CONTENT_REQUIREMENT:LANGU

In [None]:
# Re-display the sampled evaluation rows before the prompts are built from them.
eval_dataset.head()

Unnamed: 0,article,reference
44,Former Queens Park Rangers chairman Gianni Pal...,Gianni Paladini has registered as director of ...
42,London commuters faced traffic chaos this morn...,Main roads in Holborn are closed more than 24 ...
22,Princess Beatrice has been seen at the Bahrain...,Beatrice seen watching race on terrace with th...
31,"Barely out of nappies, this five-year-old boy ...","Boy, five, pelts police with stones as Palesti..."
14,A thief nicknamed the 'Black Widow of Facebook...,"Sofia Davila, 21, nicknamed the 'Black Widow o..."


In [None]:
# @title ## 1. Initial Evaluation (Baseline)
# @markdown First, we establish a baseline score. We will use an initial set of prompts and the `gemini-2.5-flash` or another Gemini model to generate responses. Then, we will evaluate these responses to get our baseline quality score.

# @markdown ---
# @markdown **Base Model**
MODEL_ID = "gemini-2.5-flash"  # @param {type: "string"}
# @markdown ---

# Work on a copy so the sampled `eval_dataset` shown in earlier cells is not
# mutated when the prompt and rubric columns are added.
data_with_rubrics = eval_dataset.copy()

# Fill the `{article}` placeholder of the shared template for every row.
data_with_rubrics["prompt"] = data_with_rubrics["article"].apply(
    lambda article: prompt_template.format(article=article)
)

# Every row reuses the rubric groups generated from the prompt template.
shared_rubric_groups = rubrics.eval_dataset_df["rubric_groups"][0]
data_with_rubrics["rubric_groups"] = [shared_rubric_groups] * len(data_with_rubrics)

print(f"Generating responses for {MODEL_ID}...")
baseline = client.evals.run_inference(
    model=MODEL_ID,
    src=data_with_rubrics,
    config={
        "generate_content_config": {"temperature": 1.6}
    },
)
baseline.show()



Generating responses for gemini-2.5-flash...


Gemini Inference: 100%|██████████| 5/5 [00:11<00:00,  2.40s/it]


In [None]:
# @title ### Run Baseline Evaluation with Adaptive Rubrics

# Score the baseline responses against the pre-generated "general" rubric group.
# Alternative metrics left disabled by the original author:
# types.RubricMetric.TEXT_QUALITY, types.RubricMetric.FLUENCY, types.Metric(name='rouge_1').
general_quality_metric = types.RubricMetric.GENERAL_QUALITY(rubric_group_name="general")
baseline_eval_result = client.evals.evaluate(
    dataset=baseline,
    metrics=[general_quality_metric],
)
baseline_eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 5/5 [00:18<00:00,  3.74s/it]


In [None]:
# @title ## 2. Model Comparison

# @markdown Next, we'll see how a different model performs on the same task. We will send the same baseline prompts to a more economical model, `gemini-2.5-flash-lite`, and compare the evaluation scores.

# @markdown ---
# @markdown **Second Model**
MODEL_2_ID = "gemini-2.5-flash-lite"  # @param {type: "string"}
# @markdown ---

print(f"Generating responses for {MODEL_2_ID}...")
# Reuses `data_with_rubrics` and the same temperature as the baseline run, so
# the model is the only thing that differs between the two candidates.
comparison = client.evals.run_inference(
    model=MODEL_2_ID,
    src=data_with_rubrics,
    config={
        "generate_content_config": {"temperature": 1.6}
    }
)
# comparison.show()


Generating responses for gemini-2.5-flash-lite...


Gemini Inference: 100%|██████████| 5/5 [00:02<00:00,  2.11it/s]


In [None]:
# @title ### Evaluate and Compare
# @markdown Use a list of datasets to compare the candidates.

# The rubrics were generated and attached to the dataset under the group name
# "general", so the metric must reference that same group to reuse them.
comparison_result = client.evals.evaluate(
    dataset=[baseline, comparison],
    metrics=[
        types.RubricMetric.GENERAL_QUALITY(
            rubric_group_name="general",
        ),
    ],
)
comparison_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 10/10 [00:15<00:00,  1.53s/it]
  PydanticSerializationUnexpectedValue(Expected `WinRateStats` - serialized value may not be as expected [input_value={'win_rates': [0.0, 0.4], 'tie_rate': 0.6}, input_type=dict])
  return self.__pydantic_serializer__.to_python(


In [None]:
# @title ## 3. Optimize Prompt

# @markdown Now, let's try to improve our results by optimizing the prompts. A well-crafted prompt can significantly improve the quality of the model's responses. We will add more specific instructions and context to our prompts.

# Make sure the instruction is substituted into the template before handing
# the prompt to the optimizer.
prompt_to_optimize = prompt_template.replace("{instruction}", instruction)
optimized_prompt = client.prompt_optimizer.optimize_prompt(prompt=prompt_to_optimize)

# The suggested prompt is the full rewritten prompt text returned by the optimizer.
optimized_instruction = optimized_prompt.parsed_response.suggested_prompt
print(optimized_instruction)

# ROLE
You are an expert at explaining complex topics to young children in a simple, engaging, and easy-to-understand way.

# TASK
Summarize the provided article.

# INSTRUCTIONS
- Your summary must be simple enough for a 5-year-old to understand.
- Use very simple vocabulary and short, clear sentences.
- Use relatable analogies that a child can understand (e.g., "It's like...").
- Maintain a friendly and encouraging tone.

# CONSTRAINTS
- The summary should be 2-4 sentences long.

# EXAMPLE
Article: "Photosynthesis is a process used by plants to convert light energy into chemical energy. By using sunlight, water, and carbon dioxide, they create glucose, which is their food, and release oxygen into the atmosphere."
Summary: "Plants eat sunshine to make their food! It's like they are little chefs. They use the sun, water, and air to make their own snacks and give us the fresh air we need to breathe."

---

Article: {article}
Summary:


In [None]:
# @markdown Let's look at the explanations of the optimization

# Dump the optimizer's parsed response as indented JSON.
print(optimized_prompt.parsed_response.model_dump_json(indent=2))

{
  "optimization_type": "zero_shot_prompt_optimization",
  "applicable_guidelines": [
    {
      "applicable_guideline": "Role",
      "suggested_improvement": "The prompt lacks a specific role for the AI. Assigning a persona, like an expert in explaining topics to children, helps prime the model for the desired tone and complexity level.",
      "text_before_change": "(The prompt does not define a role for the AI)",
      "text_after_change": "# ROLE\nYou are an expert at explaining complex topics to young children in a simple, engaging, and easy-to-understand way."
    },
    {
      "applicable_guideline": "Ambiguity",
      "suggested_improvement": "The instruction 'explaining it to a 5 year old' is an ambiguous idiom. It should be replaced with explicit, actionable instructions regarding vocabulary, sentence structure, and the use of analogies to ensure the desired level of simplicity.",
      "text_before_change": "...such that you're explaining it to a 5 year old.",
      "tex

In [None]:
# Copy so that writing the optimized prompts does not overwrite the baseline
# prompts in `data_with_rubrics` (plain assignment would alias the same frame).
optimized_data_with_rubrics = data_with_rubrics.copy()

# Collapse doubled braces back to single ones so the placeholder reads `{article}`.
# NOTE(review): presumably the optimizer sometimes escapes braces — confirm.
optimized_instruction_formatted = optimized_instruction.replace("{{", "{").replace("}}", "}")
print(optimized_instruction_formatted)

# Plain substitution instead of `str.format`: the optimized prompt is generated
# text and may contain other braces that `format` would reject.
optimized_data_with_rubrics["prompt"] = optimized_data_with_rubrics["article"].apply(
    lambda article: optimized_instruction_formatted.replace("{article}", str(article))
)

print(f"Generating responses for {MODEL_ID} with optimized prompts...")
optimized = client.evals.run_inference(
    model=MODEL_ID,
    src=optimized_data_with_rubrics,
    config={
        "generate_content_config": {"temperature": 1.6}
    },
)
optimized.show()

# ROLE
You are an expert at explaining complex topics to young children in a simple, engaging, and easy-to-understand way.

# TASK
Summarize the provided article.

# INSTRUCTIONS
- Your summary must be simple enough for a 5-year-old to understand.
- Use very simple vocabulary and short, clear sentences.
- Use relatable analogies that a child can understand (e.g., "It's like...").
- Maintain a friendly and encouraging tone.

# CONSTRAINTS
- The summary should be 2-4 sentences long.

# EXAMPLE
Article: "Photosynthesis is a process used by plants to convert light energy into chemical energy. By using sunlight, water, and carbon dioxide, they create glucose, which is their food, and release oxygen into the atmosphere."
Summary: "Plants eat sunshine to make their food! It's like they are little chefs. They use the sun, water, and air to make their own snacks and give us the fresh air we need to breathe."

---

Article: {article}
Summary:
Generating responses for gemini-2.5-flash with optimi

Gemini Inference: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


In [None]:
# @title ### Compare the improvements

# Evaluate the optimized-prompt responses with the same metric and rubric group
# ("general") that were generated and attached to the dataset, so the scores are
# comparable with the baseline evaluation.
optimized_eval_result = client.evals.evaluate(
    dataset=optimized,
    metrics=[
        types.RubricMetric.GENERAL_QUALITY(
            rubric_group_name="general",
        ),
    ],
)
optimized_eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 5/5 [00:13<00:00,  2.68s/it]


In [None]:
# @title ## 4. Use your own metric

# Custom LLM-judged metric: a 1-5 rating of how completely a summary covers
# the article it summarizes.
coverage_rating_scores = {
    "5": "Excellent. The summary contains every detail contained in the original blog article.",
    "4": "Good. The summary contains all of the significant details contained in the original blog article, with only minor details left out.",
    "3": "Fair. The summary contains most of the significant details in the original article, but there are still some significant details that are missing.",
    "2": "Poor. The summary contains a few of the significant details in the original article, but is missing a significant portion of the important details.",
    "1": "Unsatisfactory. The summary contains some minor details of the original blog article, but is missing most or all of the significant details.",
}
coverage_prompt_template = types.MetricPromptBuilder(
    instruction="Evaluate the summary's completeness in summarizing the blog article.",
    criteria={
        "Coverage": "Measure how much of the original blog article is included in the summary."
    },
    rating_scores=coverage_rating_scores,
)
coverage_metric = types.LLMMetric(
    name="coverage",
    prompt_template=coverage_prompt_template,
)

# Score the baseline responses with the custom metric.
custom_metric_eval_result = client.evals.evaluate(
    dataset=baseline,
    metrics=[coverage_metric],
)
custom_metric_eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 5/5 [00:21<00:00,  4.35s/it]


In [None]:
# Score the optimized-prompt responses with the same coverage metric.
# Note: this rebinds `custom_metric_eval_result`, replacing whatever result it held before.
custom_metric_eval_result = client.evals.evaluate(
    dataset=optimized,
    metrics=[coverage_metric],
)
custom_metric_eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 5/5 [00:19<00:00,  3.86s/it]
