# Validation

Validation ensures that LLM outputs match predefined data schemas.
This notebook demonstrates schema validation and structured-data parsing, so that downstream code always receives data in a consistent, predictable format.

## Groq structured output

In [5]:
from groq import Groq
from pydantic import BaseModel, Field
from typing import Literal, Optional, List
import json
from dotenv import load_dotenv
import os
from enum import Enum

# Load variables from a local .env file into the process environment; the Groq
# client setup in this notebook reads GROQ_API_KEY through os.getenv.
load_dotenv()

True

In [17]:
# Model identifier used by the Groq chat-completion requests in this section.
MODEL = "moonshotai/kimi-k2-instruct"
# Groq API client; the key comes from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Schema for the structured review extracted from free text. Its JSON schema is
# passed as the "parameters" of the product_review tool in the request below,
# and the tool-call arguments are validated against it afterwards.
class ProductReview(BaseModel):
    product_name: str  # presumably the name of the reviewed product
    rating: float  # numeric rating; no range is enforced here (the sample text says "out of 5")
    sentiment: Literal["positive", "neutral", "negative"]  # restricted to these three labels
    key_features: list[str]  # list of strings; the recorded run returned short phrases from the review
    
# Ask the model to extract the review by calling a single function tool whose
# parameters are the ProductReview JSON schema.
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {
            "role": "system",
            "content": "Extract product review information from the text."
        },
        {
            "role": "user",
            "content": "I bought the UltraSound Headphones last week. The noise cancellation is worst and the battery lasts all day. Sound quality is crisp and clear. I'd give it a rating of 4.0 out of 5."
        },
    ],
    tools=[{
        "type": "function",
        "function": {
            "name": "product_review",
            "parameters": ProductReview.model_json_schema(),
        },
    }],
    # Force a call to product_review. With "auto" the model may answer in plain
    # text, in which case message.tool_calls is None and indexing it fails.
    tool_choice={"type": "function", "function": {"name": "product_review"}},
)

tool_calls = response.choices[0].message.tool_calls
if not tool_calls:
    # Surface a readable error instead of a TypeError from indexing None.
    raise RuntimeError("The model returned no tool call, so there is no review to validate.")

tool_call = tool_calls[0]
# The arguments arrive as a JSON string; model_validate raises
# pydantic.ValidationError if they do not conform to ProductReview.
args = json.loads(tool_call.function.arguments)
review = ProductReview.model_validate(args)
print(json.dumps(review.model_dump(), indent=2))

# review = ProductReview.model_validate(json.loads(response.choices[0].message.content))
# print(json.dumps(review.model_dump(), indent=2))

{
  "product_name": "UltraSound Headphones",
  "rating": 4.0,
  "sentiment": "positive",
  "key_features": [
    "noise cancellation is worst",
    "battery lasts all day",
    "sound quality is crisp and clear"
  ]
}


In [16]:
"""SQL Query Generator"""

# Validation result reported for the generated query; used as the nested
# validation_status field of SQLQueryGeneration.
class ValidationStatus(BaseModel):
    is_valid: bool  # the recorded run returned true
    syntax_errors: list[str]  # list of strings; empty in the recorded run

# Schema for the structured SQL-generation response. Its JSON schema is sent as
# the response_format of the request below and the reply is validated against it.
class SQLQueryGeneration(BaseModel):
    query: str  # the generated SQL text
    query_type: str  # free-form string, not constrained to a set; the recorded run returned "SELECT"
    tables_used: list[str]  # the recorded run listed the table names referenced by the query
    estimated_complexity: str  # free-form string; the recorded run returned "medium"
    execution_notes: list[str]  # the recorded run returned explanatory sentences about the query
    validation_status: ValidationStatus  # nested validity flag and syntax-error list
    
# Conversation: a system prompt setting the SQL-expert role, followed by the
# natural-language request to translate into SQL.
sql_messages = [
    {
        "role": "system",
        "content": "You are a SQL expert. Generate structured SQL queries from natural language descriptions with proper syntax validation and metadata."
    },
    {
        "role": "user",
        "content": "Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount."
    },
]

# JSON-schema response format derived from the SQLQueryGeneration model.
sql_response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "sql_query_generation",
        "schema": SQLQueryGeneration.model_json_schema(),
    },
}

response = client.chat.completions.create(
    model=MODEL,
    messages=sql_messages,
    response_format=sql_response_format,
)

# Decode the returned JSON text, validate it against the model, then pretty-print.
raw_sql_payload = json.loads(response.choices[0].message.content)
sql_query_generation = SQLQueryGeneration.model_validate(raw_sql_payload)
print(json.dumps(sql_query_generation.model_dump(), indent=2))

{
  "query": "SELECT c.name, c.email, SUM(o.order_amount) AS total_order_amount FROM customers c JOIN orders o ON c.customer_id = o.customer_id WHERE o.order_amount > 500 AND o.order_date >= CURRENT_DATE - INTERVAL 30 DAY GROUP BY c.name, c.email",
  "query_type": "SELECT",
  "tables_used": [
    "customers",
    "orders"
  ],
  "estimated_complexity": "medium",
  "execution_notes": [
    "This query assumes that the orders table has a foreign key to the customers table.",
    "The query uses a JOIN to link customers with their orders and a WHERE clause to filter orders over $500 in the last 30 days.",
    "The results are grouped by customer name and email to calculate the total order amount for each customer."
  ],
  "validation_status": {
    "is_valid": true,
    "syntax_errors": []
  }
}


In [23]:
"""Support Ticket Classification"""

# Closed set of ticket categories used by SupportTicket.category. Mixing in str
# makes every member a plain string value as well as an Enum member.
class SupportCategory(str, Enum):
    API = "api"
    BILLING = "billing"
    ACCOUNT = "account"
    BUG = "bug"
    FEATURE_REQUEST = "feature_request"
    INTEGRATION = "integration"
    SECURITY = "security"
    PERFORMANCE = "performance"

# Allowed priority labels for SupportTicket.priority (str-valued Enum).
class Priority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
    
# Allowed subscription tiers for CustomerInfo.tier (str-valued Enum).
class CustomerTier(str, Enum):
    FREE = "free"
    PAID = "paid"
    ENTERPRISE = "enterprise"
    TRIAL = "trial"
    
# Customer details nested inside SupportTicket.customer_info.
class CustomerInfo(BaseModel):
    name: str
    company: Optional[str]=None  # optional; defaults to None when absent
    # Required, so a value must be supplied even when the ticket text does not
    # state a tier (the recorded run returned "paid" for such a message).
    tier: CustomerTier

# One technical item attached to a ticket; SupportTicket holds a list of these.
class TechnicalDetails(BaseModel):
    component: str  # the recorded run returned "dashboard UI"
    error_code: Optional[str]=None  # optional; null in the recorded run
    description: str
    
# Top-level schema for a classified support ticket. Its JSON schema is sent as
# the response_format of the classification request and the reply is validated
# against it.
class SupportTicket(BaseModel):
    category: SupportCategory
    priority: Priority
    urgency_score: float  # scale is not defined by the schema or the prompt; the recorded run returned 2.0
    customer_info: CustomerInfo
    # NOTE(review): this field has the same name as the TechnicalDetails class it
    # is annotated with. A snake_case name would avoid the shadowing, but renaming
    # changes the JSON key the model has to emit and the dumped output.
    TechnicalDetails: List[TechnicalDetails]
    keywords: List[str]
    # NOTE(review): "excalation" looks like a typo for "escalation"; the misspelt
    # key is part of the schema and of the recorded output, so confirm before renaming.
    requires_excalation: bool
    estimated_resolution_hours: float
    follow_up_date: Optional[str] = Field(None, description="ISO datetime string")
    summary: str
    
# Classify a sample support message into the SupportTicket schema.
response = client.chat.completions.create(
    # Use the notebook-level MODEL constant, as the other Groq requests do,
    # rather than repeating the model id literal.
    model=MODEL,
    messages=[
        {
            "role": "system",
            "content": """You are a customer support ticket classifier for SaaS companies. 
                         Analyze support tickets and categorize them for efficient routing and resolution.
                         Output JSON only using the schema provided.""",
        },
        { 
            "role": "user", 
            "content": """Hello! I love your product and have been using it for 6 months. 
                         I was wondering if you could add a dark mode feature to the dashboard? 
                         Many of our team members work late hours and would really appreciate this. 
                         Also, it would be great to have keyboard shortcuts for common actions. 
                         Not urgent, but would be a nice enhancement! 
                         Best, Mike from StartupXYZ"""
        },
    ],
    # Constrain the reply to the JSON schema generated from SupportTicket.
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "support_ticket_classification",
            "schema": SupportTicket.model_json_schema(),
        },
    },
)

# If content is empty, fall back to "{}" so the problem shows up as a pydantic
# validation error naming the missing fields rather than a JSON decoding error.
raw_result = json.loads(response.choices[0].message.content or "{}")
result = SupportTicket.model_validate(raw_result)
print(result.model_dump_json(indent=2))

{
  "category": "feature_request",
  "priority": "low",
  "urgency_score": 2.0,
  "customer_info": {
    "name": "Mike",
    "company": "StartupXYZ",
    "tier": "paid"
  },
  "TechnicalDetails": [
    {
      "component": "dashboard UI",
      "error_code": null,
      "description": "Customer is requesting a dark mode theme option for the dashboard to reduce eye strain during late-night work sessions"
    },
    {
      "component": "dashboard UI",
      "error_code": null,
      "description": "Customer is also requesting keyboard shortcuts for common actions to improve user workflow efficiency"
    }
  ],
  "keywords": [
    "dark mode",
    "keyboard shortcuts",
    "dashboard",
    "UI enhancement",
    "user experience"
  ],
  "requires_excalation": false,
  "estimated_resolution_hours": 48.0,
  "follow_up_date": null,
  "summary": "Customer from StartupXYZ is requesting two non-urgent UI enhancements: dark mode for the dashboard and keyboard shortcuts for common actions to impr

## Gemini Structured Output

In [25]:
from google import genai
from google.genai import types
from pydantic import BaseModel

In [None]:
# NOTE: MODEL and client are rebound here; from this cell on they refer to
# Gemini, not to the Groq model/client defined earlier in the notebook.
MODEL = "gemini-2.0-flash"
# No key is passed explicitly; the recorded output shows the SDK picking up
# GOOGLE_API_KEY from the environment.
client = genai.Client()

# Schema for one recipe; the request below asks for list[Recipe] as its response schema.
class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]  # the recorded run returned one string per ingredient, amount included
    
# Request JSON output shaped as a list of Recipe objects. The system prompt is
# supplied through system_instruction: Gemini content roles are "user" and
# "model", so a ("system", ...) entry in `contents` is not a valid turn.
config = types.GenerateContentConfig(
    system_instruction="You are an online chef, who gives recipies of dishes.",
    response_mime_type="application/json",
    response_schema=list[Recipe],
)
response = client.models.generate_content(
    model=MODEL,
    # A plain string is sent as a single user turn.
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config=config,
)
# Show the raw content parts of the first candidate.
print(response.candidates[0].content.parts)

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


[Part(
  text="""[
  {
    "recipe_name": "Chocolate Chip Cookies",
    "ingredients": [
      "2 1/4 cups all-purpose flour",
      "1 teaspoon baking soda",
      "1 teaspoon salt",
      "1 cup (2 sticks) unsalted butter, softened",
      "3/4 cup granulated sugar",
      "3/4 cup packed brown sugar",
      "1 teaspoon vanilla extract",
      "2 large eggs",
      "2 cups chocolate chips"
    ]
  },
  {
    "recipe_name": "Peanut Butter Cookies",
    "ingredients": [
      "1 cup (2 sticks) unsalted butter, softened",
      "1 cup peanut butter",
      "1 cup granulated sugar",
      "1 cup packed brown sugar",
      "2 large eggs",
      "1 teaspoon vanilla extract",
      "2 1/2 cups all-purpose flour",
      "1 teaspoon baking soda",
      "1/2 teaspoon salt"
    ]
  },
  {
    "recipe_name": "Oatmeal Raisin Cookies",
    "ingredients": [
      "1 cup (2 sticks) unsalted butter, softened",
      "3/4 cup packed brown sugar",
      "1/2 cup granulated sugar",
      "2 large eggs",

In [28]:
# Print the response's text, which here is the JSON document itself.
print(response.text)

[
  {
    "recipe_name": "Chocolate Chip Cookies",
    "ingredients": [
      "2 1/4 cups all-purpose flour",
      "1 teaspoon baking soda",
      "1 teaspoon salt",
      "1 cup (2 sticks) unsalted butter, softened",
      "3/4 cup granulated sugar",
      "3/4 cup packed brown sugar",
      "1 teaspoon vanilla extract",
      "2 large eggs",
      "2 cups chocolate chips"
    ]
  },
  {
    "recipe_name": "Peanut Butter Cookies",
    "ingredients": [
      "1 cup (2 sticks) unsalted butter, softened",
      "1 cup peanut butter",
      "1 cup granulated sugar",
      "1 cup packed brown sugar",
      "2 large eggs",
      "1 teaspoon vanilla extract",
      "2 1/2 cups all-purpose flour",
      "1 teaspoon baking soda",
      "1/2 teaspoon salt"
    ]
  },
  {
    "recipe_name": "Oatmeal Raisin Cookies",
    "ingredients": [
      "1 cup (2 sticks) unsalted butter, softened",
      "3/4 cup packed brown sugar",
      "1/2 cup granulated sugar",
      "2 large eggs",
      "1 teaspoo

In [30]:
# Closed set of instrument families; used as the response schema of the enum
# request below so the reply is one of these values.
class Instrument(str, Enum):
    PERCUSSION = "Percussion"
    STRING = "String"
    WOODWIND = "Woodwind"
    BRASS = "Brass"
    KEYBOARD = "Keyboard"
    
# With the "text/x.enum" MIME type and an Enum response schema, the reply is one
# of the Instrument values as plain text (the recorded run printed "Woodwind").
config = types.GenerateContentConfig(
    response_mime_type="text/x.enum",
    response_schema=Instrument,
)
response = client.models.generate_content(
    model=MODEL,
    # Plain-string prompt, the documented form for a single user turn and the
    # same form the graded-recipe request in this notebook uses.
    contents="what type of instrument is an oboe?",
    config=config,
)
print(response.text)

Woodwind


In [31]:
# Letter grades allowed for Recipe.grade. Unlike the other enums in this
# notebook, Grade derives from Enum only and does not mix in str.
class Grade(Enum):
    A_PLUS = "a+"
    A = "a"
    B = "b"
    C = "c"
    D = "d"
    F = "f"

# NOTE(review): this redefines Recipe from an earlier cell (recipe_name /
# ingredients); after this cell runs, the name Recipe refers to this version.
class Recipe(BaseModel):
    name: str
    grade: Grade  # must be one of the Grade values
    
# JSON output constrained to a list of Recipe objects (name plus grade).
graded_recipes_config = types.GenerateContentConfig(
    response_schema=list[Recipe],
    response_mime_type="application/json",
)

response = client.models.generate_content(
    config=graded_recipes_config,
    contents='List 10 home-baked cookie recipes and give them grades based on tastiness.',
    model=MODEL,
)

print(response.text)

[
  {
    "name": "Chocolate Chip Cookies",
    "grade": "a+"
  },
  {
    "name": "Peanut Butter Cookies",
    "grade": "a"
  },
  {
    "name": "Oatmeal Raisin Cookies",
    "grade": "b"
  },
  {
    "name": "Sugar Cookies",
    "grade": "b"
  },
  {
    "name": "Snickerdoodle Cookies",
    "grade": "a"
  },
  {
    "name": "Gingerbread Cookies",
    "grade": "b"
  },
  {
    "name": "Shortbread Cookies",
    "grade": "c"
  },
  {
    "name": "Macadamia Nut Cookies",
    "grade": "a"
  },
  {
    "name": "Double Chocolate Cookies",
    "grade": "a+"
  },
  {
    "name": "Lemon Cookies",
    "grade": "b"
  }
]


## LangChain: Structured Output

In [43]:
from langchain.chat_models import init_chat_model
from pydantic import BaseModel, Field
from typing import Optional

# Chat model handle shared by the LangChain examples that follow.
llm = init_chat_model(model_provider="google_genai", model="gemini-2.0-flash")

In [39]:
# Schema for a structured joke; passed to llm.with_structured_output below.
# The Field descriptions are presumably forwarded to the model as part of the
# schema — confirm against the provider integration.
class Joke(BaseModel):
    setup: str = Field(description="The setup of the joke")
    punchline: str = Field(description="The punchline of the joke")
    rating: Optional[int] = Field(default=None, description="how funny the joke is, rate from 1 to 10")  # optional; the 1-10 range is stated only in the description, not enforced

# Wrap the chat model so its replies are returned as Joke instances.
structured_llm = llm.with_structured_output(Joke)
# Bare final expression: the notebook displays the returned Joke object.
structured_llm.invoke("Tell me a joke about cats")

Joke(setup='Why are cats bad at poker?', punchline="Because they're always feline good!", rating=8)

### Choosing between multiple schemas

In [48]:
from typing import Union


# NOTE(review): redefines Joke from an earlier cell; this version adds a class
# docstring and slightly different field descriptions.
class Joke(BaseModel):
    """Joke to tell user."""

    setup: str = Field(description="The setup of the joke")
    punchline: str = Field(description="The punchline to the joke")
    # Optional; the 1-10 range is stated only in the description, not enforced.
    rating: Optional[int] = Field(
        default=None, description="How funny the joke is, from 1 to 10"
    )


# Alternative to Joke inside FinalResponse: a plain conversational reply.
class ConversationalResponse(BaseModel):
    """Respond in a conversational manner. Be kind and helpful."""

    response: str = Field(description="A conversational response to the user's query")


# Wrapper that lets the model answer with either schema through a Union field.
# NOTE(review): in the recorded runs both prompts, including the joke request,
# came back as ConversationalResponse, so the Union alone does not guarantee
# that Joke is chosen for joke prompts.
class FinalResponse(BaseModel):
    final_output: Union[Joke, ConversationalResponse]


# Rebind structured_llm so replies are returned as FinalResponse instances.
structured_llm = llm.with_structured_output(FinalResponse)

# Bare final expression: the notebook displays the returned FinalResponse.
structured_llm.invoke("Tell me a joke about cats")


FinalResponse(final_output=ConversationalResponse(response='Why did the cat join the Red Cross? Because he wanted to be a first aid kit-ty!'))

In [47]:
# Non-joke prompt; the recorded run returned a ConversationalResponse.
structured_llm.invoke("How are you today?")

FinalResponse(final_output=ConversationalResponse(response='I am doing great, thank you for asking! How can I help you today?'))

### PydanticOutputParser

In [49]:
from typing import List

from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field


# One extracted person; People holds a list of these.
class Person(BaseModel):
    """Information about a person."""

    # `...` marks the field as required.
    name: str = Field(..., description="The name of the person")
    # Required; the description asks for meters, so a height given in another
    # unit has to be converted by the model (the recorded run turned 6 feet into 1.8288).
    height_in_meters: float = Field(
        ..., description="The height of the person expressed in meters."
    )


# Top-level schema handed to PydanticOutputParser below.
class People(BaseModel):
    """Identifying information about all people in a text."""

    people: List[Person]  # required list of Person entries


# Parser that validates model output against the People schema.
parser = PydanticOutputParser(pydantic_object=People)

# Two-turn chat template: the system turn carries the parser's format
# instructions, the human turn carries the user's query.
_prompt_messages = [
    (
        "system",
        "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
    ),
    ("human", "{query}"),
]
_base_prompt = ChatPromptTemplate.from_messages(_prompt_messages)

# Pre-fill format_instructions so callers only need to supply `query`.
prompt = _base_prompt.partial(format_instructions=parser.get_format_instructions())

In [50]:
# Sample input: the height is given in feet, while the schema asks for meters.
query = "Anna is 23 years old and she is 6 feet tall"

# Render the fully formatted prompt, including the parser's format instructions, for inspection.
print(prompt.invoke({"query": query}).to_string())

System: Answer the user query. Wrap the output in `json` tags
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Person": {"description": "Information about a person.", "properties": {"name": {"description": "The name of the person", "title": "Name", "type": "string"}, "height_in_meters": {"description": "The height of the person expressed in meters.", "title": "Height In Meters", "type": "number"}}, "required": ["name", "height_in_meters"], "title": "Person", "type": "object"}}, "description": "Identifying information about all people in a text.", "properties": {"people": {"items"

In [53]:
# Pipe prompt -> model -> parser; the parser turns the model's reply into a
# People instance (see the recorded output).
chain = prompt | llm | parser

# Bare final expression: the notebook displays the parsed People object.
chain.invoke({"query": query})

People(people=[Person(name='Anna', height_in_meters=1.8288)])

# End of the notebook