# Grok Structured Output Testing

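Pydantic AI's `NativeOutput` structured output does not work with Grok: every run fails with "Native structured output is not supported by this model."
Grok's native parsing through the OpenAI-compatible client works well.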

In [24]:
import os
import time
from datetime import date
from enum import Enum
from typing import List

from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field

# Pull settings from the project-level .env file; override=True is passed so
# the file's values win over variables already present in the environment.
load_dotenv(dotenv_path='../envs/.env', override=True)

# Read the key once and report only whether it exists (the secret itself is
# never echoed into the notebook output).
GROK_API_KEY = os.getenv("GROK_API_KEY")
print("Grok API Key loaded: " + str(GROK_API_KEY is not None))

Grok API Key loaded: True


## Define Pydantic Schemas

Define the data models for invoice parsing.

In [2]:
# Closed set of currency codes an invoice may use. Mixing in `str` makes each
# member compare equal to its plain string value (e.g. Currency.USD == "USD").
class Currency(str, Enum):
    USD = "USD"
    EUR = "EUR"
    GBP = "GBP"
    RUB = "RUB"

# One purchased item or service on an invoice.
class LineItem(BaseModel):
    description: str = Field(description="Description of the item or service")
    # ge=1: a line must cover at least one unit.
    quantity: int = Field(description="Number of units", ge=1)
    # ge=0: negative prices are rejected, free items are allowed.
    unit_price: float = Field(description="Price per unit", ge=0)

# Postal address split into its parts; every part is a required free-form string.
class Address(BaseModel):
    street: str = Field(description="Street address")
    city: str = Field(description="City")
    # Kept as a string, not a number.
    postal_code: str = Field(description="Postal/ZIP code")
    country: str = Field(description="Country")

# Top-level extraction target: this model is passed as `response_format` to the
# Grok parse calls, so its fields and Field descriptions define what the model
# is asked to return.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring may be emitted into the generated JSON schema — confirm
# before adding one.
class Invoice(BaseModel):
    vendor_name: str = Field(description="Name of the vendor")
    vendor_address: Address = Field(description="Vendor's address")
    invoice_number: str = Field(description="Unique invoice identifier")
    # Typed as `date`, so the value is validated as a calendar date.
    invoice_date: date = Field(description="Date the invoice was issued")
    line_items: List[LineItem] = Field(description="List of purchased items/services")
    # ge=0 only bounds the value; nothing here ties it to the sum of line_items.
    total_amount: float = Field(description="Total amount due", ge=0)
    currency: Currency = Field(description="Currency of the invoice")

# Confirmation marker so the cell output shows the schemas were (re)defined.
print("‚úì Pydantic schemas defined")

‚úì Pydantic schemas defined


## Initialize Grok Client

In [3]:
# Fail fast when the key is missing. With api_key=None the OpenAI SDK falls
# back to the OPENAI_API_KEY environment variable, which would send an OpenAI
# credential to the x.ai endpoint (or fail later with a less helpful error).
if not GROK_API_KEY:
    raise RuntimeError(
        "GROK_API_KEY is not set; check ../envs/.env before creating the Grok client"
    )

# Grok is reached through the OpenAI-compatible SDK; only the base URL and the
# API key differ from a regular OpenAI client.
client = OpenAI(
    api_key=GROK_API_KEY,
    base_url="https://api.x.ai/v1",
)

print("‚úì Grok client initialized")

‚úì Grok client initialized


## Try Grok's native structured output

In [14]:
# Sample invoice used as the extraction input for the single-call smoke test.
complex_invoice_text = """
INVOICE

From: TechSupply Inc.
Address: 456 Tech Boulevard, Silicon Valley, CA 94025, USA

Invoice #: TECH-2025-999
Invoice Date: November 10, 2025

Line Items:
1. MacBook Pro 16" - Quantity: 3 - Unit Price: $2,499.00
2. Magic Mouse - Quantity: 3 - Unit Price: $79.00
3. USB-C Cable (2m) - Quantity: 10 - Unit Price: $19.99
4. Laptop Stand - Quantity: 3 - Unit Price: $49.99
5. External SSD 1TB - Quantity: 5 - Unit Price: $129.99

TOTAL: $8,383.82 USD
"""

# perf_counter() is monotonic, so the measured latency cannot be distorted by
# wall-clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

# Passing the pydantic model as response_format asks the SDK to request
# schema-constrained output and to parse the reply into an Invoice instance.
completion = client.beta.chat.completions.parse(
    model="grok-4-fast-non-reasoning",
    messages=[
        {"role": "system", "content": "Extract structured invoice data from the provided text. Be precise with numbers and dates."},
        {"role": "user", "content": complex_invoice_text}
    ],
    response_format=Invoice,
)

elapsed = time.perf_counter() - start_time

message = completion.choices[0].message
invoice = message.parsed
# `parsed` is optional (for example when the model answers with a refusal);
# raise a descriptive error instead of an AttributeError on None below.
if invoice is None:
    raise RuntimeError(
        f"Model returned no parsed invoice (refusal: {getattr(message, 'refusal', None)!r})"
    )

print(f"‚úì Parsed in {elapsed:.2f}s\n")
print(invoice.model_dump_json(indent=2))

‚úì Parsed in 4.50s

{
  "vendor_name": "TechSupply Inc.",
  "vendor_address": {
    "street": "456 Tech Boulevard",
    "city": "Silicon Valley",
    "postal_code": "94025",
    "country": "USA"
  },
  "invoice_number": "TECH-2025-999",
  "invoice_date": "2025-11-10",
  "line_items": [
    {
      "description": "MacBook Pro 16\"",
      "quantity": 3,
      "unit_price": 2499.0
    },
    {
      "description": "Magic Mouse",
      "quantity": 3,
      "unit_price": 79.0
    },
    {
      "description": "USB-C Cable (2m)",
      "quantity": 10,
      "unit_price": 19.99
    },
    {
      "description": "Laptop Stand",
      "quantity": 3,
      "unit_price": 49.99
    },
    {
      "description": "External SSD 1TB",
      "quantity": 5,
      "unit_price": 129.99
    }
  ],
  "total_amount": 8383.82,
  "currency": "USD"
}


## Test 2.1: Stress Test - 50 Runs with Structured Output Validation


In [15]:
import statistics
from typing import Dict, Any
from pydantic import ValidationError

# Input for the stress test. This is the same sample invoice text that the
# single-call cell above assigns to `complex_invoice_text`; it is repeated here
# so the stress-test cell states its own input explicitly.
complex_invoice_text = """
INVOICE

From: TechSupply Inc.
Address: 456 Tech Boulevard, Silicon Valley, CA 94025, USA

Invoice #: TECH-2025-999
Invoice Date: November 10, 2025

Line Items:
1. MacBook Pro 16" - Quantity: 3 - Unit Price: $2,499.00
2. Magic Mouse - Quantity: 3 - Unit Price: $79.00
3. USB-C Cable (2m) - Quantity: 10 - Unit Price: $19.99
4. Laptop Stand - Quantity: 3 - Unit Price: $49.99
5. External SSD 1TB - Quantity: 5 - Unit Price: $129.99

TOTAL: $8,383.82 USD
"""

def validate_invoice_structure(invoice: Invoice) -> Dict[str, Any]:
    """
    Run type and range sanity checks over every field of a parsed invoice.

    Args:
        invoice: The parsed invoice object to inspect.

    Returns:
        A dict with three keys:
          - "valid": True only when every recorded check passed and no
            exception occurred while inspecting the invoice.
          - "errors": string form of any exception raised during inspection.
          - "checks": mapping of check name -> bool, in the order the checks
            were performed (one "line_item_<i>_structure" entry per line item).
    """
    checks: Dict[str, bool] = {}
    report: Dict[str, Any] = {"valid": True, "errors": [], "checks": checks}

    try:
        # Vendor name must be a non-empty string.
        vendor = invoice.vendor_name
        checks["vendor_name"] = isinstance(vendor, str) and len(vendor) > 0

        # Every address component must be a string. A list (not a generator)
        # is used so all four attributes are read before all() is evaluated.
        address = invoice.vendor_address
        checks["vendor_address"] = all([
            isinstance(getattr(address, part), str)
            for part in ("street", "city", "postal_code", "country")
        ])

        # Invoice number must be a non-empty string.
        number = invoice.invoice_number
        checks["invoice_number"] = isinstance(number, str) and len(number) > 0

        # Invoice date must already be a date object, not raw text.
        checks["invoice_date"] = isinstance(invoice.invoice_date, date)

        # At least one line item, each with a sane description/quantity/price.
        items = invoice.line_items
        checks["line_items_count"] = len(items) > 0
        for position, entry in enumerate(items):
            description_ok = isinstance(entry.description, str)
            quantity_ok = isinstance(entry.quantity, int) and entry.quantity >= 1
            price_ok = isinstance(entry.unit_price, float) and entry.unit_price >= 0
            checks[f"line_item_{position}_structure"] = all(
                [description_ok, quantity_ok, price_ok]
            )

        # Total must be a non-negative float.
        total = invoice.total_amount
        checks["total_amount"] = isinstance(total, float) and total >= 0

        # Currency must be a member of the Currency enum.
        checks["currency"] = isinstance(invoice.currency, Currency)

        # The invoice is valid only if every individual check passed.
        report["valid"] = all(checks.values())

    except Exception as exc:
        # Any failure while reading the invoice marks it invalid; checks
        # recorded before the failure are kept for diagnosis.
        report["valid"] = False
        report["errors"].append(str(exc))

    return report


def run_stress_test(num_runs: int = 50) -> Dict[str, Any]:
    """
    Run the invoice parsing multiple times and validate each result.

    Each run sends `complex_invoice_text` to the Grok client with `Invoice` as
    the response format, times the call, and passes the parsed result through
    `validate_invoice_structure`. A run that raises is recorded in
    "parsing_errors" and does not contribute a timing.

    Args:
        num_runs: Number of parse calls to make. Zero (or a negative number)
            performs no calls and returns an all-empty report.

    Returns:
        Dict with run counters ("total_runs", "successful_runs", "failed_runs",
        "valid_structures", "invalid_structures"), percentages ("success_rate",
        "validation_rate"), a "timings" summary (avg/min/max/median/std_dev,
        each None when it cannot be computed), the raw "all_timings" list,
        "parsing_errors" and per-run "validation_details".
    """
    timings = []
    validation_results = []
    parsing_errors = []
    successful_runs = 0
    valid_structures = 0

    print(f"Starting stress test: {num_runs} runs...\n")

    for i in range(num_runs):
        try:
            # perf_counter() is monotonic, so latency is immune to wall-clock
            # adjustments that can skew time.time() differences.
            start_time = time.perf_counter()

            completion = client.beta.chat.completions.parse(
                model="grok-4-fast-non-reasoning",
                messages=[
                    {"role": "system", "content": "Extract structured invoice data from the provided text. Be precise with numbers and dates."},
                    {"role": "user", "content": complex_invoice_text}
                ],
                response_format=Invoice,
            )

            elapsed = time.perf_counter() - start_time
            timings.append(elapsed)

            # Parse and validate
            invoice = completion.choices[0].message.parsed
            validation = validate_invoice_structure(invoice)
            validation_results.append(validation)

            # "Successful" means the call and parse returned; structural
            # validity is tracked separately.
            successful_runs += 1
            if validation["valid"]:
                valid_structures += 1

            # Print progress every 10 runs
            if (i + 1) % 10 == 0:
                print(f"Progress: {i + 1}/{num_runs} - Last run: {elapsed:.2f}s - Valid: {validation['valid']}")

        except ValidationError as e:
            parsing_errors.append({
                "run": i + 1,
                "type": "ValidationError",
                "error": str(e)
            })
            print(f"‚úó Run {i + 1}: Validation Error - {e}")

        except Exception as e:
            # Deliberately broad: one failed request must not abort the whole
            # stress test; the error is recorded and reported in the summary.
            parsing_errors.append({
                "run": i + 1,
                "type": "Exception",
                "error": str(e)
            })
            print(f"‚úó Run {i + 1}: Error - {e}")

    # Calculate statistics
    results = {
        "total_runs": num_runs,
        "successful_runs": successful_runs,
        "failed_runs": len(parsing_errors),
        "valid_structures": valid_structures,
        "invalid_structures": successful_runs - valid_structures,
        # Guarded so num_runs=0 yields 0.0 instead of ZeroDivisionError.
        "success_rate": (successful_runs / num_runs) * 100 if num_runs > 0 else 0.0,
        "validation_rate": (valid_structures / successful_runs * 100) if successful_runs > 0 else 0,
        "timings": {
            "avg": statistics.mean(timings) if timings else None,
            "min": min(timings) if timings else None,
            "max": max(timings) if timings else None,
            "median": statistics.median(timings) if timings else None,
            # stdev needs at least two samples.
            "std_dev": statistics.stdev(timings) if len(timings) > 1 else None,
        },
        "all_timings": timings,
        "parsing_errors": parsing_errors,
        "validation_details": validation_results
    }

    return results


# Run the stress test
print("="*60)
print("STRESS TEST: 50 RUNS WITH STRUCTURE VALIDATION")
print("="*60 + "\n")

stress_results = run_stress_test(num_runs=50)

# Print summary
print("\n" + "="*60)
print("STRESS TEST RESULTS")
print("="*60)
print(f"\nüìä Execution Statistics:")
print(f"  Total runs: {stress_results['total_runs']}")
print(f"  Successful runs: {stress_results['successful_runs']}")
print(f"  Failed runs: {stress_results['failed_runs']}")
print(f"  Success rate: {stress_results['success_rate']:.2f}%")

print(f"\n‚úÖ Structure Validation:")
print(f"  Valid structures: {stress_results['valid_structures']}")
print(f"  Invalid structures: {stress_results['invalid_structures']}")
print(f"  Validation rate: {stress_results['validation_rate']:.2f}%")

# Timing stats are None when no run succeeded. Compare against None rather
# than relying on truthiness, so a legitimate 0.0 value is still reported.
if stress_results['timings']['avg'] is not None:
    print(f"\n‚è±Ô∏è  Performance:")
    print(f"  Average time: {stress_results['timings']['avg']:.2f}s")
    print(f"  Median time: {stress_results['timings']['median']:.2f}s")
    print(f"  Min time: {stress_results['timings']['min']:.2f}s")
    print(f"  Max time: {stress_results['timings']['max']:.2f}s")
    # std_dev is None when fewer than two timings were collected.
    if stress_results['timings']['std_dev'] is not None:
        print(f"  Std deviation: {stress_results['timings']['std_dev']:.2f}s")

if stress_results['parsing_errors']:
    print(f"\n‚ùå Errors ({len(stress_results['parsing_errors'])}):")
    for err in stress_results['parsing_errors'][:5]:  # Show first 5 errors
        # Messages are truncated to 100 characters to keep the output readable.
        print(f"  Run {err['run']}: {err['type']} - {err['error'][:100]}")
    if len(stress_results['parsing_errors']) > 5:
        print(f"  ... and {len(stress_results['parsing_errors']) - 5} more errors")

# Check for any validation failures
invalid_validations = [v for v in stress_results['validation_details'] if not v['valid']]
if invalid_validations:
    print(f"\n‚ö†Ô∏è  Invalid Structure Details:")
    for i, v in enumerate(invalid_validations[:3], 1):  # Show first 3
        print(f"\n  Invalid result #{i}:")
        print(f"    Failed checks: {[k for k, val in v['checks'].items() if not val]}")
        if v['errors']:
            print(f"    Errors: {v['errors']}")

print("\n" + "="*60)
print(f"‚úÖ VALIDATION COMPLETE: {stress_results['validation_rate']:.1f}% structures valid")
print("="*60)

STRESS TEST: 50 RUNS WITH STRUCTURE VALIDATION

Starting stress test: 50 runs...

Progress: 10/50 - Last run: 0.61s - Valid: True
Progress: 10/50 - Last run: 0.61s - Valid: True
Progress: 20/50 - Last run: 0.77s - Valid: True
Progress: 20/50 - Last run: 0.77s - Valid: True
Progress: 30/50 - Last run: 0.72s - Valid: True
Progress: 30/50 - Last run: 0.72s - Valid: True
Progress: 40/50 - Last run: 0.73s - Valid: True
Progress: 40/50 - Last run: 0.73s - Valid: True
Progress: 50/50 - Last run: 0.75s - Valid: True

STRESS TEST RESULTS

üìä Execution Statistics:
  Total runs: 50
  Successful runs: 50
  Failed runs: 0
  Success rate: 100.00%

‚úÖ Structure Validation:
  Valid structures: 50
  Invalid structures: 0
  Validation rate: 100.00%

‚è±Ô∏è  Performance:
  Average time: 0.81s
  Median time: 0.71s
  Min time: 0.61s
  Max time: 3.43s
  Std deviation: 0.45s

‚úÖ VALIDATION COMPLETE: 100.0% structures valid
Progress: 50/50 - Last run: 0.75s - Valid: True

STRESS TEST RESULTS

üìä Executi

## Trying Native Output in Pydantic AI

In [23]:
import asyncio
import time
import os
from dataclasses import dataclass
from typing import Literal, Union
import httpx
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai import Agent, NativeOutput

@dataclass
class TestDeps:
    """Dependencies for the test agent"""
    # Quiz topic; run_quiz_stress_test fills this from its `query` argument.
    query: str


# One candidate answer to a quiz question: its text, an explanation, and a flag
# saying whether it is the correct choice.
class QuizItemAnswer(BaseModel):
    answer: str = Field(description="The answer text")
    explanation: str = Field(description="The explanation text")
    correct: bool = Field(description="Whether the answer is correct")


# A single quiz question with its candidate answers.
class QuizItemTest(BaseModel):
    question: str = Field(description="The quiz question text")
    # min_length == max_length == 4 pins the list to exactly four answers. The
    # "only one correct" rule lives only in the description text here; it is
    # checked separately by validate_quiz_structure.
    answers: list[QuizItemAnswer] = Field(
        description="4 answers with only one correct",
        min_length=4,
        max_length=4
    )


# Output type requested from the agent (wrapped in NativeOutput by
# run_quiz_stress_test).
class TestOutput(BaseModel):
    # Literal with a default: the only accepted value is "quiz".
    mode: Literal["quiz"] = "quiz"
    # min_length == max_length == 5 pins the list to exactly five quiz items.
    quiz_items: list[QuizItemTest] = Field(
        description="Array of quiz items",
        min_length=5,
        max_length=5
    )


def validate_quiz_structure(quiz: TestOutput) -> Dict[str, Any]:
    """
    Check a generated quiz against the expected shape and content rules.

    Args:
        quiz: The quiz object produced by the agent.

    Returns:
        A dict with three keys:
          - "valid": True only when every recorded check passed and no
            exception occurred while inspecting the quiz.
          - "errors": string form of any exception raised during inspection.
          - "checks": mapping of check name -> bool, covering the mode, the
            item count (5), and for each item its question text, answer count
            (4), the exactly-one-correct rule, and each answer's fields.
    """
    checks: Dict[str, bool] = {}
    report: Dict[str, Any] = {"valid": True, "errors": [], "checks": checks}

    try:
        # Top-level shape: fixed mode and exactly five items.
        checks["mode"] = quiz.mode == "quiz"
        checks["quiz_items_count"] = len(quiz.quiz_items) == 5

        for q_idx, entry in enumerate(quiz.quiz_items):
            prefix = f"item_{q_idx}"

            # Question text must be a non-empty string.
            text = entry.question
            checks[f"{prefix}_question"] = isinstance(text, str) and len(text) > 0

            # Exactly four answers per question.
            options = entry.answers
            checks[f"{prefix}_answers_count"] = len(options) == 4

            # Exactly one of them flagged as correct.
            flagged = [option for option in options if option.correct]
            checks[f"{prefix}_one_correct"] = len(flagged) == 1

            # Per-answer field checks.
            for a_idx, option in enumerate(options):
                key = f"{prefix}_answer_{a_idx}"
                checks[f"{key}_text"] = (
                    isinstance(option.answer, str) and len(option.answer) > 0
                )
                checks[f"{key}_explanation"] = (
                    isinstance(option.explanation, str) and len(option.explanation) > 0
                )
                checks[f"{key}_correct"] = isinstance(option.correct, bool)

        # The quiz is valid only if every individual check passed.
        report["valid"] = all(checks.values())

    except Exception as exc:
        # Any failure while reading the quiz marks it invalid; checks recorded
        # before the failure are kept for diagnosis.
        report["valid"] = False
        report["errors"].append(str(exc))

    return report


async def run_quiz_stress_test(
    model: Union[str, OpenAIChatModel] = "grok-beta",
    query: str = "Python programming basics",
    num_runs: int = 50
) -> Dict[str, Any]:
    """
    Run the quiz generation multiple times and validate each result.

    An Agent is built once with `NativeOutput(TestOutput)` as its output type
    and then run `num_runs` times. Each successful run is timed and checked
    with `validate_quiz_structure`; a run that raises is recorded in
    "parsing_errors" and does not contribute a timing.

    Args:
        model: The model to use (default: "grok-beta"); either a model name
            string or an OpenAIChatModel instance.
        query: The topic for quiz generation
        num_runs: Number of times to run the test. Zero (or a negative number)
            performs no agent runs and returns an all-empty report.

    Returns:
        Dictionary with the model name, the query, run counters, success and
        validation percentages, a "timings" summary (avg/min/max/median/std_dev,
        each None when it cannot be computed), the raw "all_timings" list,
        "parsing_errors" and per-run "validation_details".
    """
    from dotenv import load_dotenv
    # Reload the .env file so its values are present in os.environ before the
    # agent is created.
    # NOTE(review): presumably the model provider reads its API key from the
    # environment — confirm for the provider in use.
    load_dotenv('../envs/.env', override=True)

    agent = Agent(
        model=model,
        output_type=NativeOutput(TestOutput),
        deps_type=TestDeps,
    )

    timings = []
    validation_results = []
    parsing_errors = []
    successful_runs = 0
    valid_structures = 0

    # A model instance is reported by its model_name; a string is used as-is.
    model_name = model if isinstance(model, str) else model.model_name

    print(f"Starting quiz stress test: {num_runs} runs with model {model_name}...\n")

    for i in range(num_runs):
        try:
            # perf_counter() is monotonic, so latency is immune to wall-clock
            # adjustments that can skew time.time() differences.
            start_time = time.perf_counter()

            result = await agent.run(
                f"Generate 5 quiz questions about: {query}",
                deps=TestDeps(query=query)
            )

            elapsed = time.perf_counter() - start_time
            timings.append(elapsed)

            # Get output; fall back to `.data` when the result object has no
            # `.output` attribute.
            output = result.output if hasattr(result, 'output') else result.data

            # Validate structure
            validation = validate_quiz_structure(output)
            validation_results.append(validation)

            # "Successful" means the agent run returned; structural validity
            # is tracked separately.
            successful_runs += 1
            if validation["valid"]:
                valid_structures += 1

            # Print progress every 10 runs
            if (i + 1) % 10 == 0:
                print(f"Progress: {i + 1}/{num_runs} - Last run: {elapsed:.2f}s - Valid: {validation['valid']}")

        except ValidationError as e:
            # The full message is stored; only the printed line is truncated.
            parsing_errors.append({
                "run": i + 1,
                "type": "ValidationError",
                "error": str(e)
            })
            print(f"‚úó Run {i + 1}: Validation Error - {str(e)[:100]}")

        except Exception as e:
            # Deliberately broad: one failed run must not abort the whole
            # stress test; the error is recorded and reported in the summary.
            parsing_errors.append({
                "run": i + 1,
                "type": "Exception",
                "error": str(e)
            })
            print(f"‚úó Run {i + 1}: Error - {str(e)[:100]}")

    # Calculate statistics
    results = {
        "model": model_name,
        "query": query,
        "total_runs": num_runs,
        "successful_runs": successful_runs,
        "failed_runs": len(parsing_errors),
        "valid_structures": valid_structures,
        "invalid_structures": successful_runs - valid_structures,
        # Guarded so num_runs=0 yields 0.0 instead of ZeroDivisionError.
        "success_rate": (successful_runs / num_runs) * 100 if num_runs > 0 else 0.0,
        "validation_rate": (valid_structures / successful_runs * 100) if successful_runs > 0 else 0,
        "timings": {
            "avg": statistics.mean(timings) if timings else None,
            "min": min(timings) if timings else None,
            "max": max(timings) if timings else None,
            "median": statistics.median(timings) if timings else None,
            # stdev needs at least two samples.
            "std_dev": statistics.stdev(timings) if len(timings) > 1 else None,
        },
        "all_timings": timings,
        "parsing_errors": parsing_errors,
        "validation_details": validation_results
    }

    return results


# Run the stress test
print("="*60)
print("QUIZ GENERATION STRESS TEST: 50 RUNS WITH VALIDATION")
print("="*60 + "\n")

quiz_stress_results = await run_quiz_stress_test(
    model="grok:grok-4-fast-non-reasoning",
    query="Python programming basics",
    num_runs=50
)

# Print summary
print("\n" + "="*60)
print("QUIZ STRESS TEST RESULTS")
print("="*60)
print(f"\nüìä Execution Statistics:")
print(f"  Model: {quiz_stress_results['model']}")
print(f"  Query: {quiz_stress_results['query']}")
print(f"  Total runs: {quiz_stress_results['total_runs']}")
print(f"  Successful runs: {quiz_stress_results['successful_runs']}")
print(f"  Failed runs: {quiz_stress_results['failed_runs']}")
print(f"  Success rate: {quiz_stress_results['success_rate']:.2f}%")

print(f"\n‚úÖ Structure Validation:")
print(f"  Valid structures: {quiz_stress_results['valid_structures']}")
print(f"  Invalid structures: {quiz_stress_results['invalid_structures']}")
print(f"  Validation rate: {quiz_stress_results['validation_rate']:.2f}%")

if quiz_stress_results['timings']['avg']:
    print(f"\n‚è±Ô∏è  Performance:")
    print(f"  Average time: {quiz_stress_results['timings']['avg']:.2f}s")
    print(f"  Median time: {quiz_stress_results['timings']['median']:.2f}s")
    print(f"  Min time: {quiz_stress_results['timings']['min']:.2f}s")
    print(f"  Max time: {quiz_stress_results['timings']['max']:.2f}s")
    if quiz_stress_results['timings']['std_dev']:
        print(f"  Std deviation: {quiz_stress_results['timings']['std_dev']:.2f}s")

if quiz_stress_results['parsing_errors']:
    print(f"\n‚ùå Errors ({len(quiz_stress_results['parsing_errors'])}):")
    for err in quiz_stress_results['parsing_errors'][:5]:  # Show first 5 errors
        print(f"  Run {err['run']}: {err['type']} - {err['error'][:100]}")
    if len(quiz_stress_results['parsing_errors']) > 5:
        print(f"  ... and {len(quiz_stress_results['parsing_errors']) - 5} more errors")

# Check for any validation failures
quiz_invalid_validations = [v for v in quiz_stress_results['validation_details'] if not v['valid']]
if quiz_invalid_validations:
    print(f"\n‚ö†Ô∏è  Invalid Structure Details:")
    for i, v in enumerate(quiz_invalid_validations[:3], 1):  # Show first 3
        print(f"\n  Invalid result #{i}:")
        print(f"    Failed checks: {[k for k, val in v['checks'].items() if not val]}")
        if v['errors']:
            print(f"    Errors: {v['errors']}")

print("\n" + "="*60)
print(f"‚úÖ VALIDATION COMPLETE: {quiz_stress_results['validation_rate']:.1f}% structures valid")
print("="*60)

QUIZ GENERATION STRESS TEST: 50 RUNS WITH VALIDATION

Starting quiz stress test: 50 runs with model grok:grok-4-fast-non-reasoning...

‚úó Run 1: Error - Native structured output is not supported by this model.
‚úó Run 2: Error - Native structured output is not supported by this model.
‚úó Run 3: Error - Native structured output is not supported by this model.
‚úó Run 4: Error - Native structured output is not supported by this model.
‚úó Run 5: Error - Native structured output is not supported by this model.
‚úó Run 6: Error - Native structured output is not supported by this model.
‚úó Run 7: Error - Native structured output is not supported by this model.
‚úó Run 8: Error - Native structured output is not supported by this model.
‚úó Run 9: Error - Native structured output is not supported by this model.
‚úó Run 10: Error - Native structured output is not supported by this model.
‚úó Run 11: Error - Native structured output is not supported by this model.
‚úó Run 12: Error - Native 