# TASK 3

Classification

* Create a Python function that uses Gemini to classify user questions into one of the following
categories: Employment, General Information, Emergency Services, or Tax Related

* Create a second function that generates social media posts for government announcements like
weather emergencies, holidays, school closings, etc.

Write unit tests for each function using pytest.

Use the Google Evaluation API to evaluate and compare Gemini responses from different
prompts

Keep track of old dataframe outputs/evals?


In [1]:
!pip install --upgrade --quiet google-cloud-aiplatform google-cloud-aiplatform[evaluation]
!pip install --quiet ipytest

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.1/278.1 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.9/319.9 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
project_id = !gcloud config get project
project_id = project_id[0]

location = "us-central1"



In [2]:
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.language_models import TextGenerationModel

import pytest
import ipytest
ipytest.autoconfig()

from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)


import datetime
import nest_asyncio
import pandas as pd
from IPython.display import display, Markdown, HTML

pd.set_option('display.max_colwidth', None)

  from google.cloud.aiplatform.utils import gcs_utils


In [26]:
eval_results_to_compare = []

In [35]:
# init models and eval_results

vertexai.init(project=project_id, location=location)

#Variable changes if you wish to test the effects of different models and temps
model = "gemini-2.5-flash"
temperature_val= 0



generation_config = {
      "temperature": temperature_val,
      "top_p": 0.4,
}

model = GenerativeModel(
  model,
  generation_config=generation_config,
)

In [4]:
# I wound up picking instruction following
MetricPromptTemplateExamples.list_example_metric_names()


['coherence',
 'fluency',
 'safety',
 'groundedness',
 'instruction_following',
 'verbosity',
 'text_quality',
 'summarization_quality',
 'question_answering_quality',
 'multi_turn_chat_quality',
 'multi_turn_safety',
 'pairwise_coherence',
 'pairwise_fluency',
 'pairwise_safety',
 'pairwise_groundedness',
 'pairwise_instruction_following',
 'pairwise_verbosity',
 'pairwise_text_quality',
 'pairwise_summarization_quality',
 'pairwise_question_answering_quality',
 'pairwise_multi_turn_chat_quality',
 'pairwise_multi_turn_safety']

In [5]:
def foo1(prompt):
  # uses Gemini to classify user questions into one of the following categories:
  # Employment, General Information, Emergency Services, or Tax Related

  if prompt == "" or type(prompt) != str:
    return "N/A"

  instructions = f"""
  classify the user questions into one of the following categories: Employment, General Information, Emergency Services, Tax Related or UNKNOWN.
  Examples --
  User: "I hear you are hiring?"
  Output: "Employment"
  User: "I think there's a fire in the valley!"
  Output: "Emergancy Services
  --
  User: {prompt}
  Output:
  """

  response = model.generate_content(
    contents=[instructions],
  )
  return (response.text)

In [6]:
def foo2(prompt):
  # generates social media posts for government announcements like weather
  # emergencies, holidays, school closings, etc.
  if prompt == "" or type(prompt) != str:
    return None
  instructions = """
  Return a formal and proffesional statement alerting the public based on the user prompt
  Weather, Emergancies, Holidays, School Closing, ect.
  """
  response = model.generate_content(
    contents=[instructions, prompt]
  )
  return response.text


In [None]:
# previous prompt
"""You are incharge of the PR department for the government's social media posts.
                  You make announcements for Weather, Emergancies, Holidays, School Closing, ect.
                  Translate the user prompt into a breif formal and proffesional statement
                  Only return the translated statement ready to be posted"""

"""




In [8]:
# Pytest Basic
def test_foo1_bad_input():

  prompt = 123
  res = foo1(prompt)
  assert res == "N/A"

  prompt = ""
  res = foo1(prompt)
  assert res == "N/A"


  prompt = "She sells sea shells by the sea shore"
  res = foo1(prompt)
  assert res == "UNKNOWN"

def test_foo1():
  res = foo1("Can you tell me a bit about the town?")
  assert res == "General Information"

  res = foo1("Is my W2 form important?")
  assert res == "Tax Related"

def test_2_bad():
  prompt = 123
  res = foo2(prompt)
  assert res == None

  prompt = ""
  res = foo2(prompt)
  assert res == None
ipytest.run('-rP')

[32m.[0m[32m.[0m[32m.[0m[32m                                                                                          [100%][0m
[32m[32m[1m3 passed[0m[32m in 2.68s[0m[0m


<ExitCode.OK: 0>

"But what about evaluations based on feels~" I hear you cry.
Fear not, I will cover that next.
foo2 is more dependent on tone. So it will be the focus of my evaluations.

In [9]:
## SETUP
prompts = [
    "Hurricane warnning for florida",
    "Veterens day comment",
    "Icy Roads - State of Emergancy",
]

eval_dataset = pd.DataFrame({
    "prompt": prompts,
})

In [20]:
## TASK
#Instantiate an EvalTask by associating your dataset & selected metric
eval_task = EvalTask(
  dataset=eval_dataset,
  metrics=[MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING],
)

## TESTING AND COMPARISON HERE
you can change the above code and re-run this here
If you change the prompt update the EVAL task and data set as well.

In [36]:
## Results
eval_result = eval_task.evaluate(
  model=model
)

INFO:vertexai.evaluation._evaluation:Generating a total of 3 responses from Gemini model gemini-2.5-flash.
100%|██████████| 3/3 [00:13<00:00,  4.46s/it]
INFO:vertexai.evaluation._evaluation:All 3 responses are successfully generated from Gemini model gemini-2.5-flash.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 13.377640911998242 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 3/3 [00:07<00:00,  2.35s/it]
INFO:vertexai.evaluation._evaluation:All 3 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:7.068885298000168 seconds


In [37]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Access the model name from the GenerativeModel object

eval_entry_name = f"model_{model}_temp_{temperature_val}_time_{current_time}"

eval_results_to_compare.append({
    "name": eval_entry_name,
    "eval_result": eval_result
})

print(f"Added evaluation result: {eval_entry_name}")
display(eval_result.summary_metrics)

Added evaluation result: model_<vertexai.generative_models.GenerativeModel object at 0x7ac9206ca960>_temp_0_time_20251203-201030


{'row_count': 3,
 'instruction_following/mean': np.float64(5.0),
 'instruction_following/std': 0.0}

# Compare results

because I define 'eval_results_to_compare' up at the top You can back track, update the model or the 'instructions' for function2, 'foo2', and can see the comparison here.

In [38]:
for item_dict in eval_results_to_compare:
  print("=========")
  print(f"Evaluation Name: {item_dict['name']}")
  # Assuming 'fluency/score' is the metric of interest from previous outputs
  for item in item_dict['eval_result'].summary_metrics:
    display(item + ":  " + str(item_dict['eval_result'].summary_metrics[item]))


Evaluation Name: model_<vertexai.generative_models.GenerativeModel object at 0x7ac9206ca960>_temp_2_time_20251203-200809


'row_count:  3'

'instruction_following/mean:  5.0'

'instruction_following/std:  0.0'

Evaluation Name: model_<vertexai.generative_models.GenerativeModel object at 0x7ac9206ca960>_temp_2_time_20251203-200856


'row_count:  3'

'instruction_following/mean:  5.0'

'instruction_following/std:  0.0'

Evaluation Name: model_<vertexai.generative_models.GenerativeModel object at 0x7ac9206ca960>_temp_0_time_20251203-201030


'row_count:  3'

'instruction_following/mean:  5.0'

'instruction_following/std:  0.0'