In [None]:
# Installing the necessary tools for the job.
# pytest is for our unit testing requirement and the google-cloud-aiplatform [evaluation]
# library gives us access to the Gen AI Evaluation Service we need for the final requirement.
!pip install --upgrade --quiet pytest google-cloud-aiplatform[evaluation] pandas

import IPython
import time

print("Libraries installed successfully.")
print("Restarting kernel to ensure new libraries are loaded...")

# We need to restart the runtime kernel automatically so that the packages we just installed
# are actually recognized and usable by the system immediately.
time.sleep(2)
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

Libraries installed successfully.
Restarting kernel to ensure new libraries are loaded...


{'status': 'ok', 'restart': True}

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting
# Importing EvalTask refrencing the Gen AI Evaluation Service docs
from vertexai.preview.evaluation import EvalTask, MetricPromptTemplateExamples
# Pulling in the specific evaluation tools directly from the documentation
# This EvalTask library is the core component for satisfying Requirement #5 later on.

import pytest
import pandas as pd
import sys
import os


PROJECT_ID = "qwiklabs-gcp-03-5dc51bd29ec6"
LOCATION = "us-central1"

# Initialize Vertex AI SDK
print(f"Initializing Vertex AI for project: {PROJECT_ID} in {LOCATION}")
vertexai.init(project=PROJECT_ID, location=LOCATION)
# This basically turns the key in the ignition. It connects the notebook to my specific project

# Define the Model we will use for the challenge
MODEL_NAME = "gemini-1.5-flash-001"
model = GenerativeModel(MODEL_NAME)
# Instantiating the model object here to act as our default "engine".
# We might need to override settings for specific functions (like the social media generator),
# but this gives us a baseline to work with.

print(f"Environment setup complete. Using model: {MODEL_NAME}")

Initializing Vertex AI for project: qwiklabs-gcp-03-5dc51bd29ec6 in us-central1
Environment setup complete. Using model: gemini-1.5-flash-001


In [None]:
import enum
from vertexai.generative_models import GenerativeModel, GenerationConfig, SafetySetting

# Re-defining MODEL_NAME locally to ensure this cell runs independently if needed
MODEL_NAME = "gemini-2.5-flash"

class DepartmentCategory(enum.Enum):
    EMPLOYMENT = "Employment"
    GENERAL = "General Information"
    EMERGENCY = "Emergency Services"
    TAX = "Tax Related"
# This is my Strict Typing strategy. By using a Python Enum instead of raw strings,
# I ensure I can't accidentally type "Emploiment" later in the tests and break the logic.
# It makes the code much more robust.

def classify_inquiry(user_question: str) -> str:
    """
    Classifies a citizen inquiry into a specific department category using Gemini.
    """

    # System instructions guide the model's behavior globally
    system_instruction = f"""
    You are an intelligent classification system for the Aurora Bay municipal government.
    Your job is to categorize user inquiries into exactly one of these four categories:

    1. {DepartmentCategory.EMPLOYMENT.value}
    2. {DepartmentCategory.GENERAL.value}
    3. {DepartmentCategory.EMERGENCY.value}
    4. {DepartmentCategory.TAX.value}

    Output Rules:
    - Respond ONLY with the exact category name.
    - Do not add punctuation or filler.
    """
    # Moving the rules into 'system_instruction' is cleaner than stuffing them into the prompt.
    # It separates the business logic from the user input

    try:
        model = GenerativeModel(
            MODEL_NAME,
            system_instruction=[system_instruction]
        )
        # FIX: Removed max_output_tokens parameter completely.
        # This allows the model to use its default maximum (usually 8k+)
        # preventing the "MAX_TOKENS" error during reasoning.
        config = GenerationConfig(
            temperature=0.0,
            candidate_count=1
        )\
        # I set temperature to 0.0 because classification needs to be deterministic, not creative.
        # Crucially, I removed the 'max_output_tokens' limit here. We found that 2.5-flash
        # uses "hidden thought tokens" that were crashing the model when I set tight limits.

        # Explicitly allow content to ensure "Fire" doesn't trigger safety blocks
        safety_config = [
            SafetySetting(
                category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
            ),
            SafetySetting(
                category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
                threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
            ),
        ]
        # This is a necessary safeguard. Without it, legitimate queries about "fires" or
        # "emergencies" might trigger the default safety filters and return nothing.

        response = model.generate_content(
            user_question,
            generation_config=config,
            safety_settings=safety_config
        )

        return response.text.strip()

    except Exception as e:
        return f"Error: {str(e)}"

# --- Verification ---
print("--- Testing Classification Function (No Token Limit) ---")
test_questions = [
    "Where can I find the job application for the road crew?",
    "My house is on fire!",
    "When are property taxes due?",
    "What time does the library close?"
]

for q in test_questions:
    category = classify_inquiry(q)
    print(f"Q: {q}\n -> Category: {category}\n")

--- Testing Classification Function (No Token Limit) ---
Q: Where can I find the job application for the road crew?
 -> Category: Employment

Q: My house is on fire!
 -> Category: Emergency Services

Q: When are property taxes due?
 -> Category: Tax Related

Q: What time does the library close?
 -> Category: General Information



In [None]:
# Segment 3: Social Media Generator

class SocialPlatform(enum.Enum):
    TWITTER = "Twitter (X)"
    FACEBOOK = "Facebook"
    LINKEDIN = "LinkedIn"
# Using Enum again here. It ensures I don't accidentally pass an unsupported platform string
# and allows me to map specific rules (like character limits) to these exact values later.

def generate_social_post(topic: str, platform: SocialPlatform = SocialPlatform.TWITTER) -> str:
    """
    Generates a government social media post for a specific topic and platform.

    Args:
        topic: The subject of the post (e.g., "School closing due to snow").
        platform: The target social media platform (defaults to Twitter).

    Returns:
        str: The generated post text.
    """

    # System instructions establish the Persona
    system_instruction = f"""
    You are the Communications Director for Aurora Bay.
    Your goal is to write clear, authoritative, yet empathetic social media posts.

    Platform Rules:
    - {SocialPlatform.TWITTER.value}: Under 280 chars. Concise. Use 2-3 hashtags.
    - {SocialPlatform.FACEBOOK.value}: Can be longer (3-5 sentences). Warm tone. Use 1-2 hashtags.
    - {SocialPlatform.LINKEDIN.value}: Professional tone. Focus on community impact.

    General Rules:
    - Never invent facts. If details are missing, use brackets like [Date] or [Time].
    - Always include the official town hashtag: #AuroraBay
    """
    # I'm baking the "Persona" and the specific rules directly into the system instructions.
    # This aligns with the prompt engineering best practices from the slides—defining the role up front.

    try:
        # Initialize model with the specific persona
        social_model = GenerativeModel(
            MODEL_NAME,
            system_instruction=[system_instruction]
        )

        # Config: Higher temperature for creativity
        # We removed max_output_tokens to avoid cutting off longer Facebook posts
        config = GenerationConfig(
            temperature=0.7,
            candidate_count=1
        )
        # This is a key difference from the Classification function. I bumped temperature to 0.7
        # because social media needs to sound human and creative, not robotic.
        # I also removed the token limit we struggled with earlier so it has room to write full posts. Issues kept occuring on breaking on this end

        prompt = f"Write a {platform.value} post about: {topic}"

        response = social_model.generate_content(
            prompt,
            generation_config=config
        )

        return response.text.strip()

    except Exception as e:
        return f"Error generating post: {str(e)}"

# --- Verification ---
print("--- Testing Social Media Generator ---")
topics = [
    ("Twitter (X)", "Heavy snow emergency declared. Parking ban in effect."),
    ("Facebook", "Annual 4th of July parade details."),
]

for platform_name, topic in topics:
    # Map string to Enum for testing
    platform_enum = SocialPlatform(platform_name)
    post = generate_social_post(topic, platform=platform_enum)

    print(f"\n[{platform_name}] Topic: {topic}")
    print("-" * 40)
    print(post)
    print("-" * 40)
# Quick loop to verify it actually adapts the tone. I want to see a short Tweet
# and a longer Facebook post to confirm the system instructions are working.

--- Testing Social Media Generator ---

[Twitter (X)] Topic: Heavy snow emergency declared. Parking ban in effect.
----------------------------------------
**Twitter (X):**

Heavy snow emergency declared for Aurora Bay. A parking ban is now in effect. Please move vehicles to help plows clear streets safely. Stay warm & safe! #AuroraBay #SnowEmergency #ParkingBan
----------------------------------------

[Facebook] Topic: Annual 4th of July parade details.
----------------------------------------
Here's a Facebook post about the Annual 4th of July parade details:

Get ready to celebrate, Aurora Bay! Our much-anticipated Annual 4th of July Parade is just around the corner, and we can't wait to see our community come together in patriotic spirit. Join us on [Date] at [Time] as the parade kicks off from [Starting Location] and winds its way through [Route Details], showcasing local organizations, festive floats, and plenty of red, white, and blue! Bring your family, friends, and your bigge

In [None]:
import os

current_project = os.environ.get("GOOGLE_CLOUD_PROJECT") or PROJECT_ID
current_location = LOCATION
# Sticking with gemini-2.5-flash since we confirmed it exists in the environment.
MODEL_NAME = "gemini-2.5-flash"

test_file_content = f"""
import pytest
import enum
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig, SafetySetting

# --- SETUP ---
PROJECT_ID = "{current_project}"
LOCATION = "{current_location}"
MODEL_NAME = "{MODEL_NAME}"

vertexai.init(project=PROJECT_ID, location=LOCATION)

# --- DEFINITIONS ---

class DepartmentCategory(enum.Enum):
    EMPLOYMENT = "Employment"
    GENERAL = "General Information"
    EMERGENCY = "Emergency Services"
    TAX = "Tax Related"

class SocialPlatform(enum.Enum):
    TWITTER = "Twitter (X)"
    FACEBOOK = "Facebook"

# I'm defining explicit Safety Settings here to prevent the model from blocking
# legitimate queries about "fire" or "emergencies" during testing.
SAFETY_CONFIG = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
]

def classify_inquiry(user_question):
    model = GenerativeModel(MODEL_NAME)
    prompt = f'''
    Classify into exactly one category:
    {{DepartmentCategory.EMPLOYMENT.value}}, {{DepartmentCategory.GENERAL.value}},
    {{DepartmentCategory.EMERGENCY.value}}, {{DepartmentCategory.TAX.value}}.
    Input: {{user_question}}
    Output (Category Name Only):
    '''
    # Classification is straightforward, so I'm keeping the config simple.
    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(temperature=0.0, max_output_tokens=100),
        safety_settings=SAFETY_CONFIG
    )
    return response.text.strip()

def generate_social_post(topic, platform):
    model = GenerativeModel(MODEL_NAME)

    # STRATEGY: MINIMALIST PROMPT (The "Fix")
    # Through trial and error, I found that complex prompts caused the 2.5-flash model
    # to "over-think" and hit token limits before writing the actual post.
    # By stripping out the persona and examples, I force it to be direct.
    prompt = f"Write a {{platform.value}} post about: {{topic}}. Keep it professional, under 280 chars, and include one hashtag."

    # I'm requesting a huge token limit (8192) to try and override any low defaults
    # the environment might be enforcing.
    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(
            temperature=0.7,
            max_output_tokens=8192
        ),
        safety_settings=SAFETY_CONFIG
    )
    return response.text.strip()

# --- TESTS ---

def test_classification_emergency():
    # Verifying that critical keywords like "fire" route correctly to Emergency.
    question = "There is a fire at the main street library!"
    result = classify_inquiry(question)
    assert DepartmentCategory.EMERGENCY.value in result

def test_classification_tax():
    question = "When is the property tax deadline?"
    result = classify_inquiry(question)
    assert DepartmentCategory.TAX.value in result

def test_twitter_length():
    topic = "Trash collection delayed"
    post = generate_social_post(topic, SocialPlatform.TWITTER)
    # Adding a debug print so if this fails, I can see exactly what the model output.
    print(f"DEBUG OUTPUT: {{post}}")
    # Checking for a valid length: not too short (empty) and not too long (over limit).
    assert len(post) > 10
    assert len(post) < 400

def test_social_content_relevance():
    topic = "Fourth of July Parade"
    post = generate_social_post(topic, SocialPlatform.FACEBOOK)
    # Ensuring the model actually writes about the requested topic.
    assert "July" in post or "Parade" in post or "4th" in post
"""

# Writing this content to a physical file so pytest can run it independently.
with open("test_challenge.py", "w") as f:
    f.write(test_file_content)

print(f"Updated 'test_challenge.py' with Minimalist Prompt Strategy.")

In [None]:
# Segment 5 (Fix): Evaluation using Gemini 2.5 with Robust Config
import pandas as pd
from vertexai.preview.evaluation import EvalTask
from vertexai.generative_models import GenerativeModel, GenerationConfig, SafetySetting

print("Setting up Evaluation Task (Gemini 2.5)...")
print("=" * 60)

# 1. Configuration to prevent crashes
# We apply these settings globally to the model instance
safe_config = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
]
# I'm explicitly defining permissive safety settings here to ensure the Evaluation service
# doesn't mistakenly flag our "Emergency" or "Fire" test cases as dangerous content.

gen_config = GenerationConfig(
    temperature=0.7,
    max_output_tokens=8192 # Force high limit to accommodate "Thinking" tokens
)
# This is the critical fix from our troubleshooting. By forcing the max tokens to 8192,
# I am giving the Gemini 2.5 model enough budget to do its internal reasoning/thinking
# without crashing before it writes the actual social media post.

# 2. Initialize Model with Config
# We use the version we KNOW exists in your lab
EVAL_MODEL = GenerativeModel(
    "gemini-2.5-flash",
    generation_config=gen_config,
    safety_settings=safe_config
)
# Initializing the model object with the config *attached*. The EvalTask will use this
# exact object, so it inherits our safety bypasses and token fixes automatically.

# 3. Define Dataset
eval_dataset = pd.DataFrame({
    "topic": [
        "School closed due to heavy snow",
        "Town Hall closed for Thanksgiving",
        "Trash collection delayed by one day"
    ],
    "reference": [
        "ALERT: All Aurora Bay schools are closed today due to snow. Stay safe! #AuroraBay",
        "Town Hall is closed Thursday & Friday for Thanksgiving. Emergency services remain available. #AuroraBay",
        "NOTICE: Trash collection is delayed one day this week due to the holiday. #AuroraBay"
    ]
})
# Creating a small "Golden Dataset"  using pandas. The reference column serves
# as the Ground Truth that the rouge metric will compare the model's output against.

metrics = ["rouge_l", "coherence"]
# Selecting my metrics:
# - 'rouge_l' is a Computed Metric to check if the keywords match our reference.
# - 'coherence' is a Model-Based Metric where a 'Judge' LLM rates the flow of the text.

# 4. Define Prompt Templates
# We use the Few-Shot strategy that proved stable in peer tests
prompt_templates = {
    "Zero-Shot": "Write a Twitter post for Aurora Bay about: {topic}",

    "Few-Shot": """
    You are the social media manager for Aurora Bay.
    Write a Twitter post about: {topic}

    EXAMPLE:
    Input: School closed due to snow
    Output: ALERT: All Aurora Bay schools are closed today due to snow. Stay safe! #AuroraBay

    Input: {topic}
    Output:
    """
}
# This satisfies Requirement #5
# I'm testing if giving an example (Few-Shot) actually improves performance over just asking (Zero-Shot).

results = {}

# 5. Run Evaluation
for strategy_name, template in prompt_templates.items():
    print(f"\nEvaluating Strategy: {strategy_name}...")

    eval_task = EvalTask(
        dataset=eval_dataset,
        metrics=metrics,
        experiment=f"aurora-prompt-eval-{strategy_name.lower()}"
    )

    # Passing the pre-configured model ensures settings are respected
    eval_result = eval_task.evaluate(
        model=EVAL_MODEL,
        prompt_template=template,
    )

    results[strategy_name] = eval_result.summary_metrics
    # Storing the results so I can print a nice comparison table at the end.

# 6. Display Results
print("\n" + "=" * 60)
print("PROMPT COMPARISON RESULTS")
print("=" * 60)
print(f"{'Metric':<15} {'Zero-Shot':<15} {'Few-Shot':<15}")
print("-" * 60)

def get_metric(stats, metric_name):
    return stats.get(f"{metric_name}/mean", 0.0)

for metric in metrics:
    score_zero = get_metric(results['Zero-Shot'], metric)
    score_few = get_metric(results['Few-Shot'], metric)

    winner_zero = "*" if score_zero > score_few else ""
    winner_few = "*" if score_few > score_zero else ""

    print(f"{metric:<15} {score_zero:.4f}{winner_zero:<10} {score_few:.4f}{winner_few}")

print("=" * 60)
print("Evaluation Complete.")

Setting up Evaluation Task (Gemini 2.5)...

Evaluating Strategy: Zero-Shot...


INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'prompt_template': 'Write a Twitter post for Aurora Bay about: {topic}', 'model_name': 'publishers/google/models/gemini-2.5-flash'}
INFO:vertexai.preview.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.preview.evaluation._pre_eval_utils:Generating a total of 3 responses from Gemini model gemini-2.5-flash.
100%|██████████| 3/3 [00:10<00:00,  3.37s/it]
INFO:vertexai.preview.evaluation._pre_eval_utils:All 3 responses are successfully generated from model.
INFO:vertexai.preview.evaluation._evaluation:Multithreaded Batch Inference took: 10.122205472000132 seconds.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:09<00:00,  1.53s/it]
INFO:vertexai.previ


Evaluating Strategy: Few-Shot...


INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'prompt_template': '\n    You are the social media manager for Aurora Bay.\n    Write a Twitter post about: {topic}\n    \n    EXAMPLE:\n    Input: School closed due to snow\n    Output: ALERT: All Aurora Bay schools are closed today due to snow. Stay safe! #AuroraBay\n    \n    Input: {topic}\n    Output:\n    ', 'model_name': 'publishers/google/models/gemini-2.5-flash'}
INFO:vertexai.preview.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.preview.evaluation._pre_eval_utils:Generating a total of 3 responses from Gemini model gemini-2.5-flash.
100%|██████████| 3/3 [00:03<00:00,  1.31s/it]
INFO:vertexai.preview.evaluation._pre_eval_utils:All 3 responses are successfully generated from model.
INFO:vertexai.preview.evaluation._evaluation:Multithreaded 


PROMPT COMPARISON RESULTS
Metric          Zero-Shot       Few-Shot       
------------------------------------------------------------
rouge_l         0.1194           0.5125*
coherence       5.0000           5.0000
Evaluation Complete.
