In [3]:
"""
Challenge Three: Testing and Evaluation
Testing LLM Functions and Using Evaluation API
"""

import vertexai
import pandas as pd
from google.cloud import bigquery
from vertexai.generative_models import GenerativeModel
from google.cloud.aiplatform_v1 import EvaluationServiceClient
from google.cloud.aiplatform_v1.types import evaluation_service
import google.auth
import os
import unittest
import pytest
from datetime import datetime

# ============================================================================
# Configuration
# ============================================================================
# Target GCP project, region, and Gemini model used by every call in this file.
PROJECT_ID = "qwiklabs-gcp-00-cc0593714b16"
LOCATION = "us-central1"
GENERATIVE_MODEL = "gemini-2.5-flash"

# Point the Vertex AI SDK at the project/region before any model is created.
vertexai.init(location=LOCATION, project=PROJECT_ID)

# Startup banner: a single print call emitting one line per setting.
print(
    "üöÄ Challenge Three: Testing and Evaluation - FIXED VERSION",
    f"üìç Project: {PROJECT_ID}",
    f"üìç Location: {LOCATION}",
    f"ü§ñ Model: {GENERATIVE_MODEL}",
    sep="\n",
)

# Shared model handle used by the classification and post-generation functions.
model = GenerativeModel(GENERATIVE_MODEL)

# ============================================================================
# Task 1: Classification Function - IMPROVED VERSION
# ============================================================================

def classify_question(prompt):
    """
    Ask the shared Gemini model to label a question with one town-services category.

    The instructions tell the model to answer with exactly one of: Employment,
    General Information, Emergency Services, or Tax Related. Location
    questions are steered toward Emergency Services when they concern
    police/fire/emergency sites and toward General Information otherwise.

    Args:
        prompt: The user's question; it is substituted into the instruction
            template as-is.

    Returns:
        The model's reply text with surrounding whitespace removed. The reply
        is not checked against the category list.
    """
    # The indentation inside the template is part of the text sent to the model.
    instructions = """Context: You classify user questions into one of four categories for Aurora Bay town services.

        CATEGORY DEFINITIONS:
        - Employment: Questions about jobs, hiring, employment applications, careers, work opportunities
        - General Information: General town info, locations, hours, services, community events (NON-EMERGENCY)
        - Emergency Services: Police, fire, medical emergencies, safety issues, urgent public safety matters
        - Tax Related: Taxes, bill payments, fees, financial obligations, utility payments

        IMPORTANT: Location questions should be classified as:
          - Emergency Services if asking about police/fire/emergency locations
          - General Information if asking about non-emergency locations (library, town hall, etc.)

        Output ONLY the category name: Employment, General Information, Emergency Services, or Tax Related

        Question: {0}
        Category: """
    reply = model.generate_content(instructions.format(prompt))
    return reply.text.strip()

# ============================================================================
# Task 2: Social Media Post Generator
# ============================================================================

def generate_social_media_post(prompt):
    """
    Ask the shared Gemini model to draft a social media post for an announcement.

    The instructions list five writing rules (length under 280 characters,
    hashtags, clarity, tone, call to action) followed by three worked
    input/output examples, then the caller's announcement.

    Args:
        prompt: The announcement text; it is substituted into the instruction
            template as-is.

    Returns:
        The model's reply text with surrounding whitespace removed. The rules
        are requested of the model but not enforced here.
    """
    # The indentation inside the template is part of the text sent to the model.
    instructions = """Context: You write social media posts for Aurora Bay government announcements.

        Rules:
        1. Keep posts under 280 characters
        2. Include relevant hashtags (e.g., #AuroraBay, #AKGov, #PublicSafety)
        3. Be clear and informative
        4. Use appropriate tone for the announcement type
        5. Include call to action when relevant

        Examples:
        Input: "Weather emergency - heavy snow expected tonight"
        Output: "‚ùÑÔ∏è WEATHER ALERT: Heavy snow expected tonight in Aurora Bay. Please stay off roads if possible. Stock up on essentials. #AuroraBay #AKWeather #PublicSafety"

        Input: "Town hall closed for Memorial Day"
        Output: "üèõÔ∏è REMINDER: Aurora Bay Town Hall will be closed Monday for Memorial Day. Normal hours resume Tuesday. #AuroraBay #MemorialDay #AKGov"

        Input: "School closing due to power outage"
        Output: "‚ö†Ô∏è SCHOOL CLOSING: All Aurora Bay schools closed today due to power outage. Updates will be posted here. #AuroraBaySchools #AKEd"

        Input: {0}
        Output: """
    reply = model.generate_content(instructions.format(prompt))
    return reply.text.strip()

# ============================================================================
# Task 3: Unit Tests - FIXED VERSION
# ============================================================================

class TestClassificationFunction(unittest.TestCase):
    """Checks that classify_question returns the expected label for sample questions."""

    def _expect_category(self, category, *questions):
        # Classify the questions in the given order; the first mismatch fails the test.
        for question in questions:
            self.assertEqual(classify_question(question), category)

    # NOTE: unittest prints the first docstring line of each test at verbosity=2,
    # so those lines are part of the runner's visible output.

    def test_employment_classification(self):
        """Test employment-related questions"""
        self._expect_category(
            "Employment",
            "How do I apply for a job with the town?",
            "What are the career opportunities in Aurora Bay?",
        )

    def test_emergency_services_classification(self):
        """Test emergency services questions - FIXED: More specific examples"""
        self._expect_category(
            "Emergency Services",
            # Direct request for an emergency service
            "How do I report a fire emergency?",
            # Location of an emergency facility
            "Where is the police station located?",
            # Emergency contact details
            "What's the number for emergency services?",
        )

    def test_tax_related_classification(self):
        """Test tax-related questions"""
        self._expect_category(
            "Tax Related",
            "When are property taxes due?",
            "How do I pay my water bill?",
        )

    def test_general_information_classification(self):
        """Test general information questions"""
        self._expect_category(
            "General Information",
            "What are the town hall hours?",
            "Where is the public library located?",
            # Non-emergency facility, in contrast to the police-station case above
            "When does the community center open?",
        )

class TestSocialMediaFunction(unittest.TestCase):
    """Checks that generated social media posts satisfy the posting rules."""

    def check_post_rules(self, post):
        """Validate a post against the length, hashtag, and town-name rules.

        Returns a ``(passed, message)`` tuple; the message names the first
        rule that failed, or states that all rules were followed.
        """
        # Rule 1: at most 280 characters
        char_count = len(post)
        if char_count > 280:
            return False, f"Post too long: {char_count} characters"

        # Rule 2: at least one '#' somewhere in the text
        if '#' not in post:
            return False, "No hashtags found"

        # Rule 3: the town is named, either spaced or in hashtag form
        if not any(name in post for name in ('Aurora Bay', 'AuroraBay')):
            return False, "No Aurora Bay reference"

        return True, "All rules followed"

    def _validated_post(self, announcement):
        # Generate a post, assert it passes every rule, and hand it back for display.
        post = generate_social_media_post(announcement)
        passed, reason = self.check_post_rules(post)
        self.assertTrue(passed, reason)
        return post

    # NOTE: unittest prints the first docstring line of each test at verbosity=2,
    # so those lines are part of the runner's visible output.

    def test_weather_emergency_post(self):
        """Test weather emergency post generation"""
        post = self._validated_post("Weather emergency - heavy snow expected tonight")
        print(f"‚úÖ Weather Post: {post}")

    def test_holiday_closure_post(self):
        """Test holiday closure post generation"""
        post = self._validated_post("Town hall closed for Memorial Day")
        print(f"‚úÖ Holiday Post: {post}")

    def test_school_closing_post(self):
        """Test school closing post generation"""
        post = self._validated_post("School closing due to power outage")
        print(f"‚úÖ School Post: {post}")

# ============================================================================
# Task 4: Evaluation API Implementation - IMPROVED VERSION
# ============================================================================

def evaluate_classification_performance():
    """Score classify_question against ten labelled questions.

    Prints a pass/fail line pair per question and the overall accuracy.

    Returns:
        A ``(results, accuracy)`` tuple. ``results`` is a list of dicts with
        the keys ``question``, ``expected``, ``actual`` and ``correct``;
        ``accuracy`` is the fraction of questions whose reply equals the
        expected label exactly.
    """
    print("\nüìä EVALUATING CLASSIFICATION PERFORMANCE")
    print("=" * 60)

    # (question, expected category) pairs, chosen to probe the category boundaries
    labelled_questions = (
        ("How do I apply for a police officer position?", "Employment"),
        ("Where is the fire department located for emergencies?", "Emergency Services"),
        ("When are property taxes due?", "Tax Related"),
        ("What are the library hours?", "General Information"),
        ("I need to pay my water bill online", "Tax Related"),
        ("Job opportunities in town government", "Employment"),
        ("How to report a power outage emergency", "Emergency Services"),
        ("Town hall contact information for general inquiries", "General Information"),
        ("Where is the nearest police station?", "Emergency Services"),
        ("When does the park open for public use?", "General Information"),
    )

    results = []
    for question, expected in labelled_questions:
        actual = classify_question(question)
        record = {
            "question": question,
            "expected": expected,
            "actual": actual,
            "correct": actual == expected,
        }
        results.append(record)

        if record["correct"]:
            marker = "‚úÖ"
        else:
            marker = "‚ùå"
        print(f"{marker} Q: {question}")
        print(f"   Expected: {expected}, Got: {actual}")

    # Booleans sum as 0/1, giving the number of exact matches.
    hits = sum(record["correct"] for record in results)
    accuracy = hits / len(results)
    print(f"\nüìà Classification Accuracy: {accuracy:.1%} ({hits}/{len(results)})")

    return results, accuracy

def evaluate_social_media_performance():
    """Score generate_social_media_post against five sample announcements.

    A post counts as valid when it is at most 280 characters long, contains a
    '#', and names the town as 'Aurora Bay' or 'AuroraBay'. Prints the post
    and its rule outcomes for each announcement, then the overall rate.

    Returns:
        A ``(results, validity_rate)`` tuple. ``results`` is a list of dicts
        with the keys ``announcement``, ``post``, ``length_ok``,
        ``has_hashtags``, ``mentions_town`` and ``valid``; ``validity_rate``
        is the fraction of posts that passed all three rules.
    """
    print("\nüìä EVALUATING SOCIAL MEDIA POSTS")
    print("=" * 60)

    sample_announcements = (
        "Weather emergency - heavy snow expected tonight",
        "Town hall closed for Memorial Day",
        "School closing due to power outage",
        "New park opening ceremony this Saturday",
        "Boil water notice for downtown area",
    )

    results = []
    for announcement in sample_announcements:
        post = generate_social_media_post(announcement)
        post_length = len(post)

        # Evaluate each rule independently so all outcomes can be reported.
        checks = {
            "length_ok": post_length <= 280,
            "has_hashtags": '#' in post,
            "mentions_town": any(name in post for name in ('Aurora Bay', 'AuroraBay')),
        }
        is_valid = all(checks.values())

        results.append({"announcement": announcement, "post": post, **checks, "valid": is_valid})

        if is_valid:
            marker = "‚úÖ"
        else:
            marker = "‚ùå"
        print(f"{marker} Announcement: {announcement}")
        print(f"   Post: {post}")
        print(f"   Length: {post_length} chars, Hashtags: {checks['has_hashtags']}, Mentions Town: {checks['mentions_town']}")

    # Booleans sum as 0/1, giving the number of fully valid posts.
    valid_count = sum(entry["valid"] for entry in results)
    validity_rate = valid_count / len(results)
    print(f"\nüìà Social Media Post Validity: {validity_rate:.1%} ({valid_count}/{len(results)})")

    return results, validity_rate

# ============================================================================
# Alternative Prompt Comparison - IMPROVED VERSION
# ============================================================================

def classify_question_alternative(prompt):
    """Classify a question using a second prompt wording, for comparison.

    This variant lists the four categories in bracketed form and spells out
    the emergency vs. non-emergency distinction for locations and contacts.

    Args:
        prompt: The user's question; it is substituted into the instruction
            template as-is.

    Returns:
        The model's reply text with surrounding whitespace removed. The reply
        is not checked against the category list.
    """
    # The indentation inside the template is part of the text sent to the model.
    instructions = """CATEGORIZE THIS QUESTION FOR AURORA BAY TOWN SERVICES:

        CATEGORIES:
        [Employment] = jobs, hiring, work applications, careers, employment opportunities
        [General Information] = locations (NON-EMERGENCY), hours, services, community info
        [Emergency Services] = police, fire, medical emergencies, safety issues, emergency locations
        [Tax Related] = taxes, bills, payments, fees, financial obligations

        KEY DISTINCTION:
        - Police/Fire station locations = Emergency Services
        - Library/Town Hall locations = General Information
        - Emergency contacts = Emergency Services
        - General contacts = General Information

        Return ONLY the category name from: Employment, General Information, Emergency Services, Tax Related

        Question: {0}
        Category:"""
    reply = model.generate_content(instructions.format(prompt))
    return reply.text.strip()

def compare_prompt_effectiveness():
    """Run both classification prompts on the same questions and compare them.

    Each of five questions is classified with classify_question and then with
    classify_question_alternative; a reply counts as correct only when it
    equals the expected label exactly. Prints a per-question breakdown, both
    accuracies, and which prompt did better (or a tie).

    Returns:
        An ``(original_accuracy, alternative_accuracy)`` tuple of fractions.
    """
    print("\nüîç COMPARING PROMPT EFFECTIVENESS")
    print("=" * 60)

    comparison_questions = (
        "How do I apply for a job with the town?",
        "Where is the police station for emergencies?",
        "When are property taxes due?",
        "What are the town hall hours?",
        "How do I report a fire emergency?",
    )
    expected_answers = (
        "Employment",
        "Emergency Services",
        "Tax Related",
        "General Information",
        "Emergency Services",
    )

    mark = {True: "‚úÖ", False: "‚ùå"}
    original_hits = 0
    alternative_hits = 0

    for question, expected in zip(comparison_questions, expected_answers):
        # Original prompt is queried first, then the alternative.
        original_result = classify_question(question)
        alternative_result = classify_question_alternative(question)

        original_match = original_result == expected
        alternative_match = alternative_result == expected

        # Booleans add as 0/1.
        original_hits += original_match
        alternative_hits += alternative_match

        print(f"Q: {question}")
        print(f"  Expected: {expected}")
        print(f"  Original: {original_result} {mark[original_match]}")
        print(f"  Alternative: {alternative_result} {mark[alternative_match]}")
        print()

    question_count = len(comparison_questions)
    original_accuracy = original_hits / question_count
    alternative_accuracy = alternative_hits / question_count

    print(f"üìä ORIGINAL PROMPT ACCURACY: {original_accuracy:.1%}")
    print(f"üìä ALTERNATIVE PROMPT ACCURACY: {alternative_accuracy:.1%}")

    if original_accuracy == alternative_accuracy:
        verdict = "üéØ TIE: Both prompts perform equally"
    elif original_accuracy > alternative_accuracy:
        verdict = "üéØ BEST: Original Prompt"
    else:
        verdict = "üéØ BEST: Alternative Prompt"
    print(verdict)

    return original_accuracy, alternative_accuracy

# ============================================================================
# Main Execution
# ============================================================================

def main():
    """Drive the demo: spot-check both generators, then run the evaluations.

    Prints sample classifications and sample posts, runs the classification
    evaluation, the social-media evaluation and the prompt comparison, and
    ends with a summary of the four resulting rates plus the model and
    project in use.
    """
    print("üöÄ CHALLENGE THREE: TESTING AND EVALUATION - FIXED VERSION")
    print("=" * 60)

    # Spot-check the classifier on a handful of questions.
    print("\nüß™ TESTING CLASSIFICATION FUNCTION")
    print("=" * 40)

    sample_questions = (
        "How do I apply for a job with the town?",
        "Where is the police station for emergencies?",
        "When are property taxes due?",
        "What are the town hall hours?",
        "How do I report a fire emergency?",
    )
    for sample in sample_questions:
        label = classify_question(sample)
        print(f"Q: {sample}", f"A: {label}\n", sep="\n")

    # Spot-check the post generator on two announcements.
    print("\nüß™ TESTING SOCIAL MEDIA FUNCTION")
    print("=" * 40)

    sample_announcements = (
        "Weather emergency - heavy snow expected tonight",
        "Town hall closed for Memorial Day",
    )
    for notice in sample_announcements:
        draft = generate_social_media_post(notice)
        print(
            f"Announcement: {notice}",
            f"Post: {draft}",
            f"Length: {len(draft)} characters\n",
            sep="\n",
        )

    # Full evaluations; only the rates are needed for the summary below.
    _, class_accuracy = evaluate_classification_performance()
    _, social_accuracy = evaluate_social_media_performance()

    # Head-to-head comparison of the two classification prompts.
    original_acc, alternative_acc = compare_prompt_effectiveness()

    print("\nüéØ EVALUATION SUMMARY")
    print("=" * 60)
    summary_lines = [
        f"üìä Classification Accuracy: {class_accuracy:.1%}",
        f"üìä Social Media Post Validity: {social_accuracy:.1%}",
        f"üìä Original Prompt Accuracy: {original_acc:.1%}",
        f"üìä Alternative Prompt Accuracy: {alternative_acc:.1%}",
        f"ü§ñ Model Used: {GENERATIVE_MODEL}",
        f"üìç Project: {PROJECT_ID}",
    ]
    for line in summary_lines:
        print(line)

    print("\n‚úÖ CHALLENGE THREE COMPLETED SUCCESSFULLY!")

# ============================================================================
# Run Unit Tests with unittest - FIXED VERSION
# ============================================================================

def run_unit_tests():
    """Run the classification and social-media test cases with unittest.

    Builds a suite from explicitly named test methods so they run in a fixed
    order, executes it with a verbose ``unittest.TextTestRunner``, and prints
    a short summary followed by the last traceback line of every failed or
    errored test.

    Returns:
        The ``unittest.TestResult`` produced by the runner.
    """
    print("\nüî¨ RUNNING UNIT TESTS")
    print("=" * 60)

    # Test methods listed in the order they should execute.
    planned = (
        (TestClassificationFunction, (
            'test_employment_classification',
            'test_emergency_services_classification',
            'test_tax_related_classification',
            'test_general_information_classification',
        )),
        (TestSocialMediaFunction, (
            'test_weather_emergency_post',
            'test_holiday_closure_post',
            'test_school_closing_post',
        )),
    )
    suite = unittest.TestSuite(
        case(method_name) for case, method_names in planned for method_name in method_names
    )

    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)

    # Skipped tests are included in testsRun but did not pass, so exclude them
    # from the pass count along with failures and errors.
    failed = len(result.failures)
    errored = len(result.errors)
    skipped = len(result.skipped)
    passed = result.testsRun - failed - errored - skipped

    print(f"\nüìä TEST RESULTS: {result.testsRun} tests run")
    print(f"‚úÖ Passed: {passed}")
    print(f"‚ùå Failed: {failed}")
    print(f"‚ö†Ô∏è  Errors: {errored}")
    if skipped:
        # Only shown when relevant, so the all-pass output is unchanged.
        print(f"   Skipped: {skipped}")

    # One-line reason per problem test: the final traceback line carries the
    # exception message. Errors get the same treatment as failures.
    for header, outcomes in (
        ("\nüîç FAILURE DETAILS:", result.failures),
        ("\nüîç ERROR DETAILS:", result.errors),
    ):
        if outcomes:
            print(header)
            for test, trace_text in outcomes:
                print(f"   {test}: {trace_text.splitlines()[-1]}")

    return result

# Execute the main function
if __name__ == "__main__":
    # Evaluation walkthrough first, then the unit-test suite.
    main()
    result = run_unit_tests()

    # Closing banner: success only when there are neither failures nor errors.
    if not (result.failures or result.errors):
        print("\nüéâ ALL TESTS PASSED SUCCESSFULLY!")
    else:
        print(f"\n‚ùå SOME TESTS FAILED: {len(result.failures)} failures, {len(result.errors)} errors")


üöÄ Challenge Three: Testing and Evaluation - FIXED VERSION
üìç Project: qwiklabs-gcp-00-cc0593714b16
üìç Location: us-central1
ü§ñ Model: gemini-2.5-flash
üöÄ CHALLENGE THREE: TESTING AND EVALUATION - FIXED VERSION

üß™ TESTING CLASSIFICATION FUNCTION




Q: How do I apply for a job with the town?
A: Employment

Q: Where is the police station for emergencies?
A: Emergency Services

Q: When are property taxes due?
A: Tax Related

Q: What are the town hall hours?
A: General Information

Q: How do I report a fire emergency?
A: Emergency Services


üß™ TESTING SOCIAL MEDIA FUNCTION
Announcement: Weather emergency - heavy snow expected tonight
Post: ‚ùÑÔ∏è WEATHER ALERT: Heavy snow expected tonight in Aurora Bay. Please stay off roads if possible. Stock up on essentials. #AuroraBay #AKWeather #PublicSafety
Length: 155 characters

Announcement: Town hall closed for Memorial Day
Post: üèõÔ∏è REMINDER: Aurora Bay Town Hall will be closed Monday for Memorial Day. Normal hours resume Tuesday. #AuroraBay #MemorialDay #AKGov
Length: 133 characters


üìä EVALUATING CLASSIFICATION PERFORMANCE
‚úÖ Q: How do I apply for a police officer position?
   Expected: Employment, Got: Employment
‚úÖ Q: Where is the fire department located for emergencies?
  

test_employment_classification (__main__.TestClassificationFunction.test_employment_classification)
Test employment-related questions ... 

Q: How do I report a fire emergency?
  Expected: Emergency Services
  Original: Emergency Services ‚úÖ
  Alternative: Emergency Services ‚úÖ

üìä ORIGINAL PROMPT ACCURACY: 100.0%
üìä ALTERNATIVE PROMPT ACCURACY: 100.0%
üéØ TIE: Both prompts perform equally

üéØ EVALUATION SUMMARY
üìä Classification Accuracy: 100.0%
üìä Social Media Post Validity: 100.0%
üìä Original Prompt Accuracy: 100.0%
üìä Alternative Prompt Accuracy: 100.0%
ü§ñ Model Used: gemini-2.5-flash
üìç Project: qwiklabs-gcp-00-cc0593714b16

‚úÖ CHALLENGE THREE COMPLETED SUCCESSFULLY!

üî¨ RUNNING UNIT TESTS


ok
test_emergency_services_classification (__main__.TestClassificationFunction.test_emergency_services_classification)
Test emergency services questions - FIXED: More specific examples ... ok
test_tax_related_classification (__main__.TestClassificationFunction.test_tax_related_classification)
Test tax-related questions ... ok
test_general_information_classification (__main__.TestClassificationFunction.test_general_information_classification)
Test general information questions ... ok
test_weather_emergency_post (__main__.TestSocialMediaFunction.test_weather_emergency_post)
Test weather emergency post generation ... ok
test_holiday_closure_post (__main__.TestSocialMediaFunction.test_holiday_closure_post)
Test holiday closure post generation ... 

‚úÖ Weather Post: ‚ùÑÔ∏è WEATHER ALERT: Heavy snow expected tonight in Aurora Bay. Prepare for difficult travel conditions & potential power outages. Stay safe! #AuroraBay #AKWeather #PublicSafety


ok
test_school_closing_post (__main__.TestSocialMediaFunction.test_school_closing_post)
Test school closing post generation ... 

‚úÖ Holiday Post: üèõÔ∏è REMINDER: Aurora Bay Town Hall will be closed Monday for Memorial Day. Normal hours resume Tuesday. #AuroraBay #MemorialDay #AKGov


ok

----------------------------------------------------------------------
Ran 7 tests in 12.754s

OK


‚úÖ School Post: ‚ö†Ô∏è SCHOOL CLOSING: All Aurora Bay schools closed today due to power outage. Updates will be posted here. #AuroraBaySchools #AKEd

üìä TEST RESULTS: 7 tests run
‚úÖ Passed: 7
‚ùå Failed: 0
‚ö†Ô∏è  Errors: 0

üéâ ALL TESTS PASSED SUCCESSFULLY!
