# Structured Outputs with Pydantic

Learn to get reliable JSON outputs from LLMs using Pydantic models.

In [None]:
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from typing import Literal, List, Optional
from enum import Enum

# Initialize the client. No api_key is passed, so the SDK reads it from the
# OPENAI_API_KEY environment variable and no secret is stored in this file.
client = OpenAI()

## Problem: Unreliable Text Parsing

In [None]:
# Without structured outputs - FRAGILE!
# A plain chat completion: the reply comes back as free-form text.
response = client.chat.completions.create(
    messages=[
        dict(
            role="system",
            content="Classify sentiment as positive, negative, or neutral. Include confidence score.",
        ),
        dict(role="user", content="I love this product!"),
    ],
    model="gpt-4o-mini",
)

text_response = response.choices[0].message.content
print("Raw text response:", text_response, sep="\n")
print(
    "\n⚠️  How do you parse this reliably?",
    "   - Format can vary",
    "   - Extra words included",
    "   - Confidence format inconsistent",
    sep="\n",
)

## Solution: Structured Outputs

In [None]:
# Define schema with Pydantic
# Schema for a single sentiment classification result.
class SentimentAnalysis(BaseModel):
    # Restricted to exactly these three string values
    label: Literal["positive", "negative", "neutral"]
    # Bounded to the closed interval [0, 1] by the ge/le constraints
    confidence: float = Field(ge=0, le=1, description="Confidence score between 0 and 1")
    # Free-text explanation; no length constraint is applied
    reasoning: str = Field(description="Brief explanation")

# Get structured output
# The schema class is handed to the SDK as `response_format`; the parsed
# model instance is then read from `message.parsed`.
response = client.beta.chat.completions.parse(
    response_format=SentimentAnalysis,
    messages=[
        dict(role="system", content="You are a sentiment classifier."),
        dict(role="user", content="I love this product!"),
    ],
    model="gpt-4o-mini",
)

result = response.choices[0].message.parsed

print("Structured output:")
print(
    f"  Label: {result.label}",
    f"  Confidence: {result.confidence}",
    f"  Reasoning: {result.reasoning}",
    "\n✓ Type-safe, validated, ready to use!",
    sep="\n",
)

## Field Types and Validation

In [None]:
# Different field types
# Review schema showing the kinds of constraints `Field` can express.
# NOTE(review): confirm the target API's structured-output schema subset
# accepts the length/range keywords these constraints generate.
class ProductReview(BaseModel):
    # String with length constraints (1 to 100 characters)
    product_name: str = Field(min_length=1, max_length=100)

    # Integer with range (1 <= rating <= 5)
    rating: int = Field(ge=1, le=5, description="Rating from 1 to 5")

    # Float with validation (must be strictly greater than 0)
    price: float = Field(gt=0, description="Price in USD")

    # Boolean
    would_recommend: bool

    # Optional field: defaults to None, at most 500 characters when present
    comment: Optional[str] = Field(None, max_length=500)

    # List of strings
    pros: List[str] = Field(description="List of positive aspects")
    cons: List[str] = Field(description="List of negative aspects")

# Test it
review_text = """
Product: Wireless Mouse Pro
Review: This mouse is amazing! Great ergonomics and battery life lasts forever.
However, it's a bit pricey at $79.99. I'd still recommend it though.
Rating: 4/5
"""

# Ask for the review text to be parsed into a ProductReview instance
response = client.beta.chat.completions.parse(
    response_format=ProductReview,
    messages=[
        dict(role="system", content="Extract structured review data."),
        dict(role="user", content=review_text),
    ],
    model="gpt-4o-mini",
)

review = response.choices[0].message.parsed
print(
    f"Product: {review.product_name}",
    f"Rating: {review.rating}/5",
    f"Price: ${review.price}",
    f"Would recommend: {review.would_recommend}",
    f"Pros: {', '.join(review.pros)}",
    f"Cons: {', '.join(review.cons)}",
    sep="\n",
)

## Enums for Fixed Categories

In [None]:
# Use Enum for categories
# Closed set of ticket categories. Mixing in `str` makes each member a string
# as well, so `.value` is the lowercase text shown on the right-hand side.
class TicketCategory(str, Enum):
    TECHNICAL = "technical"
    BILLING = "billing"
    ACCOUNT = "account"
    GENERAL = "general"

# Closed set of priority levels; members are also `str` instances.
class Priority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    URGENT = "urgent"

# Classification result for one support ticket.
class TicketClassification(BaseModel):
    # Must be one of the TicketCategory members
    category: TicketCategory
    # Must be one of the Priority members
    priority: Priority
    # Plain boolean flag; no further constraints
    requires_escalation: bool

# Classify ticket
ticket = "My account was charged twice. Need refund immediately."

# Parse the model's reply into a TicketClassification instance
response = client.beta.chat.completions.parse(
    response_format=TicketClassification,
    messages=[
        dict(role="system", content="Classify support tickets."),
        dict(role="user", content=ticket),
    ],
    model="gpt-4o-mini",
)

result = response.choices[0].message.parsed
# `.value` prints the plain string behind each enum member
print(
    f"Category: {result.category.value}",
    f"Priority: {result.priority.value}",
    f"Escalate: {result.requires_escalation}",
    sep="\n",
)

## Custom Validators

In [None]:
import re

# Contact details with normalizing validators on `email` and `phone`.
# (Documented with `#` comments rather than a class docstring so the JSON
# schema generated from this model stays unchanged.)
class ContactInfo(BaseModel):
    name: str = Field(min_length=1)
    email: str
    phone: str

    @field_validator('email')
    @classmethod
    def validate_email(cls, v: str) -> str:
        """Check that *v* looks like an email address and normalize it.

        Surrounding whitespace is stripped and the result is lowercased.

        Raises:
            ValueError: if the stripped value is not of the form
                ``local@domain.tld``.
        """
        candidate = v.strip()
        # fullmatch: `$` with re.match would also accept a trailing newline
        if not re.fullmatch(r'[\w\.-]+@[\w\.-]+\.\w+', candidate):
            raise ValueError('Invalid email format')
        return candidate.lower()  # Normalize to lowercase

    @field_validator('phone')
    @classmethod
    def validate_phone(cls, v: str) -> str:
        """Normalize *v* to ``+<digits>`` and reject anything else.

        Whitespace, hyphens, parentheses and dots are removed. A value that
        does not start with ``+`` is given the US country code ``+1``.

        Raises:
            ValueError: if anything other than digits remains after the
                leading ``+`` (this includes an empty number).
        """
        # Remove common separators
        cleaned = re.sub(r'[\s\-\(\)\.]', '', v)
        if not cleaned.startswith('+'):
            cleaned = '+1' + cleaned  # Add US code if missing
        # Without this check, input such as "N/A" would pass as "+1N/A"
        if not re.fullmatch(r'\+\d+', cleaned):
            raise ValueError('Invalid phone format')
        return cleaned

# Extract contact info
text = "Hi, I'm John Smith. Email me at JOHN@EXAMPLE.COM or call (555) 123-4567"

# Parse the reply into ContactInfo; its validators run during parsing
response = client.beta.chat.completions.parse(
    response_format=ContactInfo,
    messages=[
        dict(role="system", content="Extract contact information."),
        dict(role="user", content=text),
    ],
    model="gpt-4o-mini",
)

contact = response.choices[0].message.parsed
print(
    f"Name: {contact.name}",
    f"Email: {contact.email}",  # Lowercase
    f"Phone: {contact.phone}",  # Normalized format
    sep="\n",
)

## Nested Models

In [None]:
# Nested structures
# Postal address; every part is a plain, unconstrained string.
class Address(BaseModel):
    street: str
    city: str
    state: str
    # Declared as str, not int
    zip_code: str

# One line item of an order.
class OrderItem(BaseModel):
    product: str
    # At least 1
    quantity: int = Field(ge=1)
    # Strictly greater than 0
    price: float = Field(gt=0)

# Complete order: scalar fields plus nested OrderItem and Address models.
class Order(BaseModel):
    order_id: str
    customer_name: str
    # Each element is validated as an OrderItem
    items: List[OrderItem]
    # Validated as a nested Address model
    shipping_address: Address
    # Strictly greater than 0; not cross-checked against the item prices here
    total: float = Field(gt=0)

# Extract order data
order_text = """
Order #12345 for Jane Doe
Items:
- 2x Widget ($29.99 each)
- 1x Gadget ($49.99)
Ship to: 123 Main St, Springfield, IL 62701
Total: $109.97
"""

# Parse the reply into an Order, including its nested models
response = client.beta.chat.completions.parse(
    response_format=Order,
    messages=[
        dict(role="system", content="Extract order information."),
        dict(role="user", content=order_text),
    ],
    model="gpt-4o-mini",
)

order = response.choices[0].message.parsed
print(
    f"Order: {order.order_id}",
    f"Customer: {order.customer_name}",
    "\nItems:",
    sep="\n",
)
for item in order.items:
    print(f"  {item.quantity}x {item.product} @ ${item.price}")
print(
    f"\nShip to: {order.shipping_address.street}, {order.shipping_address.city}",
    f"Total: ${order.total}",
    sep="\n",
)

## Multiple Classifications

In [None]:
# Classify multiple texts at once
# Sentiment result for one input text.
class Sentiment(BaseModel):
    # Text this result belongs to; the print loop below shows its first 30 characters
    text: str
    # Restricted to exactly these three string values
    label: Literal["positive", "negative", "neutral"]
    # Bounded to the closed interval [0, 1]
    confidence: float = Field(ge=0, le=1)

# Wrapper model that holds a list of Sentiment results under one field.
class BatchSentiment(BaseModel):
    # The schema does not constrain how many entries this list contains
    results: List[Sentiment]

texts = [
    "I love this!",
    "This is terrible.",
    "It's okay, nothing special."
]

# Number the inputs from 1 so all texts can be sent in a single prompt
prompt = "Classify sentiment for each:\n" + "\n".join(
    f"{i}. {text}" for i, text in enumerate(texts, start=1)
)

response = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Classify sentiment."},
        {"role": "user", "content": prompt}
    ],
    response_format=BatchSentiment
)

results = response.choices[0].message.parsed
print("Batch results:")
for r in results.results:
    # Add the ellipsis only when the text was actually cut at 30 characters;
    # previously short texts were printed as e.g. 'I love this!...'
    preview = r.text if len(r.text) <= 30 else r.text[:30] + "..."
    print(f"  '{preview}' → {r.label} ({r.confidence:.2f})")

## Error Handling

In [None]:
from pydantic import ValidationError

def safe_parse(text, schema):
    """Request a reply parsed into *schema*, reporting failures as text.

    Args:
        text: User message content sent to the model.
        schema: Model class passed to the SDK as ``response_format``.

    Returns:
        A ``(parsed, error)`` tuple. On success ``parsed`` is the parsed
        object and ``error`` is ``None``. On failure ``parsed`` is ``None``
        and ``error`` is a message starting with ``"Validation error:"``,
        ``"API error:"`` or ``"No parsed output:"``.
    """
    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Extract structured data."},
                {"role": "user", "content": text}
            ],
            response_format=schema
        )
        message = response.choices[0].message

    except ValidationError as e:
        return None, f"Validation error: {e}"

    except Exception as e:
        # Deliberately broad: this helper turns every failure into a message
        return None, f"API error: {e}"

    # A refusal (or any reply that was not parsed) leaves `parsed` as None.
    # Report it as an error; returning (None, None) would look like success.
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        reason = refusal or "model returned no structured content"
        return None, f"No parsed output: {reason}"

    return message.parsed, None

# Test
result, error = safe_parse("Extract contact: John Doe", ContactInfo)
# Print the error message when one is present, otherwise the parsed result
print(f"Error: {error}" if error else f"Success: {result}")

## Production Pattern: Classifier Class

In [None]:
from functools import lru_cache

class StructuredClassifier:
    """Reusable classifier with caching.

    Results are memoized per instance, keyed on the
    ``(text, schema, system_prompt)`` arguments, so all three must be
    hashable. Each instance keeps up to 1000 entries.
    """

    def __init__(self, client, model="gpt-4o-mini"):
        """Store the client and model name and create this instance's cache.

        Args:
            client: Object exposing ``beta.chat.completions.parse``.
            model: Model name sent with every request.
        """
        self.client = client
        self.model = model
        # Per-instance cache. Decorating the method itself with lru_cache
        # stores `self` in a class-level cache, which keeps every instance
        # (and its client) alive and makes all instances share one budget.
        self._cached_classify = lru_cache(maxsize=1000)(self._classify_uncached)

    def _classify_uncached(self, text, schema, system_prompt):
        """Send one request and return the parsed reply, without caching."""
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            response_format=schema
        )
        return response.choices[0].message.parsed

    def classify(self, text: str, schema, system_prompt: str):
        """Classify *text* into *schema*, reusing a cached result if present.

        Args:
            text: User message content.
            schema: Model class passed to the SDK as ``response_format``.
            system_prompt: System message content.

        Returns:
            The ``parsed`` attribute of the first choice's message. Repeated
            calls with equal arguments return the same cached object.
        """
        return self._cached_classify(text, schema, system_prompt)

    def cache_info(self):
        """Return hit/miss statistics for this instance's cache."""
        return self._cached_classify.cache_info()

    def cache_clear(self):
        """Drop every cached result held by this instance."""
        self._cached_classify.cache_clear()

# Usage
classifier = StructuredClassifier(client)

# Arguments spelled out by keyword to show what each one is
sentiment = classifier.classify(
    text="I love this!",
    schema=SentimentAnalysis,
    system_prompt="Classify sentiment",
)

print(f"Result: {sentiment.label} ({sentiment.confidence})")

## Summary

✅ Pydantic models define schemas  
✅ Field validation with constraints  
✅ Enums for fixed categories  
✅ Custom validators for complex validation  
✅ Nested models for complex structures  
✅ Batch processing multiple items  
✅ Error handling and caching

**Key benefits:**
- Type-safe outputs
- No parsing logic needed
- Automatic validation
- Validated data that is ready to store in a database
- Reliable enough for production use

**Next:** Learn about function calling for agentic behaviors!