# Pydantic for LLM workflows

## Basics

In [1]:
from pydantic import BaseModel, Field, ValidationError, EmailStr
from typing import Optional
from datetime import date
class UserInput(BaseModel):
    name: str = Field(...,description="The name of the user")
    email: EmailStr = Field(...,description="The email of the user")
    query: str = Field(...,description="The query of the user")
    order_id : Optional[int] = Field(
        None, 
        description="5 digit order number (cannot start with 0)",
        ge=10000,
        le=99999
    )
    purchase_date: Optional[date] = None

In [None]:
## validate input with our simple Pydantic model
user = UserInput(
    name = "abc",
    email ="abc12@gmail.com",
    query ="How can I cancel my subscription?",
)
print(user)

name='abc' email='abc12@gmail.com' query='How can I cancel my subscription?' order_id=None purchase_date=None


In [3]:
# defining fucntions to handle user input validations safely
from typing import Union
def validate_user_input(input_data: dict) -> Union[UserInput, None]:
    try: 
        user_input = UserInput(**input_data)
        print(f"Validated user input: ✅ {input_data}")
        return user_input
    except ValidationError as e:
        print(f"Validation error: ❌ {e}")
        return None
    
# test the function
user_input = {
    'name': '123',
    'email': 'john.doe@example.com',
    'query': 'How can I cancel my subscription?'
}
validate_user_input(user_input)

Validated user input: ✅ {'name': '123', 'email': 'john.doe@example.com', 'query': 'How can I cancel my subscription?'}


UserInput(name='123', email='john.doe@example.com', query='How can I cancel my subscription?', order_id=None, purchase_date=None)

In [4]:
# test the function
user_input = {
    'email': 'john.doe@example.com',
    'query': 'How can I cancel my subscription?'
}
validate_user_input(user_input)

Validation error: ❌ 1 validation error for UserInput
name
  Field required [type=missing, input_value={'email': 'john.doe@examp...ancel my subscription?'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing


In [6]:
# test the function
user_input = {
    'email': 'john.doe@example.com',
    'query': 'How can I cancel my subscription?',
    'name':'abc',
    'order_id': 40000,
    'purchase_date':'2020-10-02'
}
validate_user_input(user_input)

Validated user input: ✅ {'email': 'john.doe@example.com', 'query': 'How can I cancel my subscription?', 'name': 'abc', 'order_id': 40000, 'purchase_date': '2020-10-02'}


UserInput(name='abc', email='john.doe@example.com', query='How can I cancel my subscription?', order_id=40000, purchase_date=datetime.date(2020, 10, 2))

# Validating Json + Data fields
- using model_validate_json

In [10]:
json_data_good = '''
{
  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "name": "abc",
  "order_id": 40000,
  "purchase_date": "2020-10-02"
}
'''
json_data_bad_json_fmt = '''

  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "name": "abc",
  "order_id": 40000,
  "purchase_date": "2020-10-02"
}
'''

json_data_bad_data_fmt = '''
{
  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "name": 123,
  "order_id": 03000,
  "purchase_date": "2020-10-02"
}
'''

print(f'Validating good json: ✅ {UserInput.model_validate_json(json_data_good)}')
print(f'Validating json with bad JSON formatting: ‼️ {UserInput.model_validate_json(json_data_bad_json_fmt)}')

Validating good json: ✅ name='abc' email='john.doe@example.com' query='How can I cancel my subscription?' order_id=40000 purchase_date=datetime.date(2020, 10, 2)


ValidationError: 1 validation error for UserInput
  Invalid JSON: trailing characters at line 3 column 10 [type=json_invalid, input_value='\n\n  "email": "john.doe...ate": "2020-10-02"\n}\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid

In [12]:
print(f'Validating json with bad JSON formatting: ‼️ {UserInput.model_validate_json(json_data_bad_data_fmt)}')

ValidationError: 1 validation error for UserInput
  Invalid JSON: invalid number at line 6 column 16 [type=json_invalid, input_value='\n{\n  "email": "john.do...ate": "2020-10-02"\n}\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid

# Validating LLM response with Pydantic

In [15]:
# imports
from pydantic import BaseModel, ValidationError, Field, EmailStr
from typing import List, Literal, Optional
import json
from datetime import date
from dotenv import load_dotenv
import openai
load_dotenv()

True

In [16]:
client = openai.OpenAI()

In [19]:
user_input_json = '''
{
  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "name": "abc",
  "order_id": null,
  "purchase_date": null
}
'''

In [18]:
class UserInput(BaseModel):
    name: str = Field(...,description="The name of the user")
    email: EmailStr = Field(...,description="The email of the user")
    query: str = Field(...,description="The query of the user")
    order_id : Optional[int] = Field(
        None, 
        description="5 digit order number (cannot start with 0)",
        ge=10000,
        le=99999
    )
    purchase_date: Optional[date] = None

In [20]:
user_input = UserInput.model_validate_json(user_input_json)

In [None]:
# Customer Query Model
class CustomerQuery(UserInput):
    priority:str = Field(...,
                         description="Priority level: low, medium, high")
    category: Literal['refund_request', 'information_request', 'other'] = Field(...,
                                                                                description='Query Category')
    is_complaint: bool = Field(...,
                               description='Whether this is a complaint or not?')
    tags: List[str] = Field(...,
                             description='Relevant keywords tags')

In [22]:
## give an example response to LLM to let it know what you expect as output
example_response_structure = f"""{{
    name ="Example User",
    email = "user@example.com"
    query = "I ordered a new computer monitor and it arrived with a broken screen"
    order_id = 12345
    purchase_date = "2023-11-10"
    priority = "medium"
    category = "refund_request"
    is_complaint = True
    tags = ["monitor","support","exchange"]
}}"""

In [25]:
llm_prompt = f""" 
Please analyse this user query\n{user_input.model_dump_json(indent=2)}:

Return your analysis as a JSON object matching this exact structure
and datatypes:
{example_response_structure}

Respond ONLY with valid JSON. Do not include andy explanations or 
other text or formatting before or after JSON object
"""
print(llm_prompt)

 
Please analyse this user query
{
  "name": "abc",
  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "order_id": null,
  "purchase_date": null
}:

Return your analysis as a JSON object matching this exact structure
and datatypes:
{
    name ="Example User",
    email = "user@example.com"
    query = "I ordered a new computer monitor and it arrived with a broken screen"
    order_id = 12345
    purchase_date = "2023-11-10"
    priority = "medium"
    category = "refund_request"
    is_complaint = True
    tags = ["monitor","support","exchange"]
}

Respond ONLY with valid JSON. Do not include andy explanations or 
other text or formatting before or after JSON object



In [26]:
# call LLM
def call_llm(prompt, model="gpt-4o"):
    response = client.chat.completions.create(
        model = model,
        messages=[{"role":"user", "content":prompt}]
    )
    return response.choices[0].message.content

In [27]:
response_content = call_llm(llm_prompt)
print(response_content)

```json
{
    "name": "abc",
    "email": "john.doe@example.com",
    "query": "How can I cancel my subscription?",
    "order_id": null,
    "purchase_date": null,
    "priority": "high",
    "category": "subscription_cancellation",
    "is_complaint": false,
    "tags": ["subscription", "cancellation", "support"]
}
```


In [29]:
CustomerQuery.model_validate_json(response_content)

ValidationError: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "name": ...on", "support"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid

In [30]:
# function to catch validation errors gracefully
def validate_llm_responses(data_model, llm_response):
    try:
        validated_data = data_model.model_validate_json(llm_response)
        print(f"Data validation Successfull: ✅")
        print(validated_data.model_dump_json(indent=2))
        return validated_data, None
    except ValidationError as e:
        print(f"❌ Error Validating data: {e}")
        error_message = {
            f"This response generated a validation error: {e}"
        }
        return None, error_message

In [37]:
validated_data, validation_error = validate_llm_responses(CustomerQuery, response_content)
print(validation_error)

❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "name": ...on", "support"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
{'This response generated a validation error: 1 validation error for CustomerQuery\n  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value=\'```json\\n{\\n    "name": ...on", "support"]\\n}\\n```\', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid'}


In [33]:
# Create a retry feedback loop
def create_retry_prompt(
    original_prompt,
    original_response, 
    error_message
):
    retry_prompt = f""" 
    There is a request to fix an error in the structure of llm response. 
    Here is the original request
    <original_prompt>
    {original_prompt}
    </original_prompt>
    
    Here is the original llm response:
    <llm_response>
    {original_response}
    </llm_response>
    
    This response generated an error:
    <error_message>
    {error_message}
    </error_message>
    
    Compare the error message and the llm_response and identify what needs to be fixed or removed
    in the llm_response to resolve this error.
    
    Respond ONLY with valid JSON. Do not include any explanations or other text or formatting before or after the JSON string
    """
    return retry_prompt

In [38]:
validation_retry_prompt = create_retry_prompt(
    original_prompt=llm_prompt,
    original_response=response_content,
    error_message=validation_error
)
print(validation_retry_prompt)

 
    There is a request to fix an error in the structure of llm response. 
    Here is the original request
    <original_prompt>
     
Please analyse this user query
{
  "name": "abc",
  "email": "john.doe@example.com",
  "query": "How can I cancel my subscription?",
  "order_id": null,
  "purchase_date": null
}:

Return your analysis as a JSON object matching this exact structure
and datatypes:
{
    name ="Example User",
    email = "user@example.com"
    query = "I ordered a new computer monitor and it arrived with a broken screen"
    order_id = 12345
    purchase_date = "2023-11-10"
    priority = "medium"
    category = "refund_request"
    is_complaint = True
    tags = ["monitor","support","exchange"]
}

Respond ONLY with valid JSON. Do not include andy explanations or 
other text or formatting before or after JSON object

    </original_prompt>

    Here is the original llm response:
    <llm_response>
    ```json
{
    "name": "abc",
    "email": "john.doe@example.com",
   

In [39]:
validation_retry_response = call_llm(validation_retry_prompt)
print(validation_retry_response)

```json
{
    "name": "abc",
    "email": "john.doe@example.com",
    "query": "How can I cancel my subscription?",
    "order_id": null,
    "purchase_date": null,
    "priority": "high",
    "category": "subscription_cancellation",
    "is_complaint": false,
    "tags": ["subscription", "cancellation", "support"]
}
```


In [41]:
# Define a function to automatically retry an LLM call multiple times
def validate_llm_response(
    prompt, data_model, n_retry=5, model="gpt-4o"
):
    # Initial LLM call
    response_content = call_llm(prompt, model=model)
    current_prompt = prompt

    # Try to validate with the model
    # attempt: 0=initial, 1=first retry, ...
    for attempt in range(n_retry + 1):

        validated_data, validation_error = validate_llm_responses(
            data_model, response_content
        )

        if validation_error:
            if attempt < n_retry:
                print(f"retry {attempt} of {n_retry} failed, trying again...")
            else:
                print(f"Max retries reached. Last error: {validation_error}")
                return None, (
                    f"Max retries reached. Last error: {validation_error}"
                )

            validation_retry_prompt = create_retry_prompt(
                original_prompt=current_prompt,
                original_response=response_content,
                error_message=validation_error
            )
            response_content = call_llm(
                validation_retry_prompt, model=model
            )
            current_prompt = validation_retry_prompt
            continue

        # If you get here, both parsing and validation succeeded
        return validated_data, None

In [42]:
# Test your complete solution with the original prompt
validated_data, error = validate_llm_response(
    llm_prompt, CustomerQuery
)

❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "name": ...on", "support"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 0 of 5 failed, trying again...
❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "name": ...on", "support"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 1 of 5 failed, trying again...
❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "name": ...on", "support"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 2 of 5 failed, trying again...
❌

In [44]:
# Investigate the model_json_schema for CustomerQuery
data_model_schema = json.dumps(
    CustomerQuery.model_json_schema(), indent=2
)
print(data_model_schema)

{
  "properties": {
    "name": {
      "description": "The name of the user",
      "title": "Name",
      "type": "string"
    },
    "email": {
      "description": "The email of the user",
      "format": "email",
      "title": "Email",
      "type": "string"
    },
    "query": {
      "description": "The query of the user",
      "title": "Query",
      "type": "string"
    },
    "order_id": {
      "anyOf": [
        {
          "maximum": 99999,
          "minimum": 10000,
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "5 digit order number (cannot start with 0)",
      "title": "Order Id"
    },
    "purchase_date": {
      "anyOf": [
        {
          "format": "date",
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Purchase Date"
    },
    "priority": {
      "description": "Priority level: low, medi

In [45]:
# Create new prompt with user input and model_json_schema
prompt = f"""
Please analyze this user query\n {user_input.model_dump_json(indent=2)}:

Return your analysis as a JSON object matching the following schema:
{data_model_schema}

Respond ONLY with valid JSON. Do not include any explanations or 
other text or formatting before or after the JSON object.
"""

In [46]:
# Run your validate_llm_response function with the new prompt
final_analysis, error = validate_llm_response(
    prompt, CustomerQuery
)

❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "name": "a..."cancellation"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 0 of 5 failed, trying again...
❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "name": "a..."cancellation"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 1 of 5 failed, trying again...
❌ Error Validating data: 1 validation error for CustomerQuery
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "name": "a..."cancellation"]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
retry 2 of 5 failed, trying again...
❌

# Using instructor to validate LLM responses with pydantic

In [47]:
from pydantic import BaseModel, Field, EmailStr
from typing import List, Literal, Optional
from openai import OpenAI
import instructor
import anthropic
from dotenv import load_dotenv
from datetime import date

In [48]:
class UserInput(BaseModel):
    name: str = Field(...,description="The name of the user")
    email: EmailStr = Field(...,description="The email of the user")
    query: str = Field(...,description="The query of the user")
    order_id : Optional[int] = Field(
        None, 
        description="5 digit order number (cannot start with 0)",
        ge=10000,
        le=99999
    )
    purchase_date: Optional[date] = None

In [49]:
# Customer Query Model
class CustomerQuery(UserInput):
    priority:str = Field(...,
                         description="Priority level: low, medium, high")
    category: Literal['refund_request', 'information_request', 'other'] = Field(...,
                                                                                description='Query Category')
    is_complaint: bool = Field(...,
                               description='Whether this is a complaint or not?')
    tags: List[str] = Field(...,
                             description='Relevant keywords tags')

In [59]:
user_input_json = '''
{
  "email": "john.doe@example.com",
  "query": "I ordered a product but it didn't arrive and now I want to cancel my order",
  "name": "abc",
  "order_id": 12345,
  "purchase_date": null
}
'''

In [60]:
user_input=  UserInput.model_validate_json(user_input_json)

In [61]:
prompt = (
    f"Analyze the following customer query {user_input} "
    f"and provide a structured response."
)

In [65]:
load_dotenv()

openai_client = instructor.from_openai(
    openai.OpenAI()
)
response = openai_client.beta.chat.completions.parse(
    model = 'gpt-4o',
    max_tokens = 1024,
    messages = [
        {
            'role':'user',
            'content':prompt
        }
    ],
    response_format = CustomerQuery,
)

In [66]:
print(response.choices[0].message.content)

{"name":"abc","email":"john.doe@example.com","query":"I ordered a product but it didn't arrive and now I want to cancel my order","order_id":12345,"purchase_date":null,"priority":"high","category":"refund_request","is_complaint":true,"tags":["order cancellation","delivery issue","refund"]}


In [67]:
response = openai_client.responses.parse(
    model = 'gpt-4o',
    input = [
        {
            'role':'user',
            'content':prompt
        }
    ],
    text_format = CustomerQuery,
)

In [70]:
print(response.output_text)

{"name":"abc","email":"john.doe@example.com","query":"I ordered a product but it didn't arrive and now I want to cancel my order","order_id":12345,"purchase_date":null,"priority":"high","category":"refund_request","is_complaint":true,"tags":["order","cancellation","delivery_issue","refund"]}


# Pydantic for Tool Calling

In [71]:
from pydantic import BaseModel, Field, EmailStr, field_validator
from pydantic_ai import Agent
from typing import List, Literal, Optional
from datetime import datetime, date
import json
from openai import OpenAI
import anthropic
import instructor
from dotenv import load_dotenv
import nest_asyncio

load_dotenv()
nest_asyncio.apply()

In [87]:
class UserInput(BaseModel):
    name: str = Field(...,description="The name of the user")
    email: EmailStr = Field(...,description="The email of the user")
    query: str = Field(...,description="The query of the user")
    order_id : Optional[str] = Field(
        None, 
        description="Order ID if available (format: ABC-12345)"
    )
    # Validate order_id format (e.g should be of format ABC-12345)
    @field_validator('order_id')
    def validate_order_id(cls, order_id):
        import re
        if order_id is None:
            return order_id
        pattern = r"^[A-Z]{3}-\d{5}$"
        if not re.match(pattern, order_id):
            raise ValueError(
                "order_id must be in format ABC-12345"
                "(3 uppercase letters, dash, 5 digits)"
            )
    purchase_date: Optional[date] = None

In [88]:
# Customer Query Model
class CustomerQuery(UserInput):
    priority:str = Field(...,
                         description="Priority level: low, medium, high")
    category: Literal['refund_request', 'information_request', 'other'] = Field(...,
                                                                                description='Query Category')
    is_complaint: bool = Field(...,
                               description='Whether this is a complaint or not?')
    tags: List[str] = Field(...,
                             description='Relevant keywords tags')

In [89]:
from typing import Union
def validate_user_input(input_data: str) -> Union[UserInput, None]:
    try: 
        user_input = UserInput.model_validate_json(input_data)
        print(f"Validated user input: ✅ {input_data}")
        return user_input
    except ValidationError as e:
        print(f"Validation error: ❌ {e}")
        return None

In [90]:
def create_customer_query(valid_user_json: str) -> CustomerQuery:
    customer_query_agent = Agent(
        model = "gpt-4o",
        output_type=CustomerQuery
    )
    response = customer_query_agent.run_sync(valid_user_json)
    print("CustomerQuery generated..")
    return response.output

In [91]:
user_input_json = '''
{
  "email": "john.doe@example.com",
  "query": "I ordered a product but it didn't arrive and now I want to cancel my order",
  "name": "abc",
  "order_id": "ABC-12355",
  "purchase_date": null
}
'''

In [92]:
valid_data = validate_user_input(user_input_json).model_dump_json()
customer_query = create_customer_query(valid_data)
print(customer_query.model_dump_json(indent=2))


Validated user input: ✅ 
{
  "email": "john.doe@example.com",
  "query": "I ordered a product but it didn't arrive and now I want to cancel my order",
  "name": "abc",
  "order_id": "ABC-12355",
  "purchase_date": null
}

CustomerQuery generated..
{
  "name": "abc",
  "email": "john.doe@example.com",
  "query": "I ordered a product but it didn't arrive and now I want to cancel my order",
  "order_id": null,
  "purchase_date": null,
  "priority": "high",
  "category": "refund_request",
  "is_complaint": true,
  "tags": [
    "order issue",
    "cancellation",
    "non-delivery"
  ]
}


In [93]:
# Create an FAQ lookup agent
class FAQLookupArgs(BaseModel):
    query:str = Field(..., description="User's query")
    tags: List[str] = Field(..., description="Relevant keyword tags from the customer query"
                            )

In [None]:
# Class to check order status
class CheckOrderStatusArgs(BaseModel):
    order_id:str = Field(...,
                         description = "Customer's order ID (format: ABC-12345)")
    email: EmailStr = Field(..., description="Customer's email address")
    @field_validator('order_id')
    def validate_order_id(cls, order_id):
        import re
        if order_id is None:
            return order_id
        pattern = r"^[A-Z]{3}-\d{5}$"
        if not re.match(pattern, order_id):
            raise ValueError(
                "order_id must be in format ABC-12345"
                "(3 uppercase letters, dash, 5 digits)"
            )
        return order_id