## Output structured using an LLM

We need to configure a key in:

https://platform.openai.com/api-keys


In [7]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import instructor
from typing import Union
load_dotenv()

# Initialize the OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_AP'))

from pydantic import BaseModel, Field, field_validator, model_validator
from typing import ClassVar

# Patch the OpenAI client
client = instructor.from_openai(client)

### Example 1

Notice how we can leverage pydantic to define a schema desired in response to our model, and also implement some basic validation.

In [13]:

class UserInfo(BaseModel):
    country: str = Field(description="The country where the user is located")
    city: str = Field(description="The unique identifier for the city where the user is located")
    language: str = Field(description="The primary language spoken by the user")
    age: int= Field(...,description="The age of the user", gt=0, lt=120)

    model_config: ClassVar[dict] = {
        "json_schema_extra": {
            "examples": [
                {
                    "country": "united states",
                    "city": "new york",
                    "language": "english",
                    "age": 30
                }
            ]
        }
    }


In [14]:


# Extract structured data from natural language
user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages= [{"role": "system", "content": "Please identify entities in the input of user"},
        {"role": "user", "content": "I am Eduardo from Miami, United States and I speak English, I am 200"}
    ],
    max_retries=2
)

print(user_info.model_dump())


{'country': 'united states', 'city': 'miami', 'language': 'english', 'age': 119}


### Example 2

Now suppose we need to enforce for any particular reason the output as UPPERCASE.So we can deep dive in validation of pydantic

In [18]:
class UserInfo(BaseModel):
    country: str = Field(description="The country where the user is located")
    city: str = Field(description="The unique identifier for the city where the user is located")
    language: str = Field(description="The primary language spoken by the user")
    age: int= Field(...,description="The age of the user", gt=0, lt=120)

    model_config: ClassVar[dict] = {
        "json_schema_extra": {
            "examples": [
                {
                    "country": "united state",
                    "city": "new york",
                    "language": "english",
                    "age": 30
                }
            ]
        }
    }

    @field_validator('country', 'city', 'language')
    @classmethod
    def validate_and_uppercase_fields(cls, value: str) -> str:
        return value.upper()

    @model_validator(mode='after')
    def validate_age(self) -> 'UserInfo':
        if self.age <= 0 or self.age >= 120:
            raise ValueError('Age must be between 1 and 119')
        return self


In [23]:

# Extract structured data from natural language
user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages= [{"role": "system", "content": "Please identify entities in the input of user"},
        {"role": "user", "content": "I am Eduardo from Miami, United States and I speak English, also i am -200 year old"}
    ],
    max_retries=2
)

print(user_info.model_dump())

{'country': 'UNITED STATES', 'city': 'MIAMI', 'language': 'ENGLISH', 'age': 30}


as we notice we, enforce our output as we want.

### Example 3

In [29]:
from enum import Enum

class SentimentEnum(str, Enum):
    POSITIVE = "positive"
    NEUTRAL = "neutral"

class TextInfo(BaseModel):
    sentiment: SentimentEnum = Field(description="The sentiment of the text")
    length_of_text: int = Field(description="The length of the text in characters")

    model_config: ClassVar[dict] = {
        "json_schema_extra": {
            "examples": [
                {
                    "sentiment": "positive",
                    "length_of_text": 123
                }
            ]
        }
    }

    @model_validator(mode='after')
    def validate_length_of_text(self) -> 'TextInfo':
        if self.length_of_text <= 0:
            raise ValueError('Length of text must be positive')
        return self


In [30]:

# Extract structured data from natural language
user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=TextInfo,
    messages= [{"role": "system", "content": "Indentifica el sentiemiento y la longitud del texto"},
        {"role": "user", "content": "El dia de hoy fue un dia muy bueno, me siento muy feliz"}
    ],
    max_retries=2
)

print(user_info.model_dump())

{'sentiment': <SentimentEnum.POSITIVE: 'positive'>, 'length_of_text': 66}


In [31]:

# Extract structured data from natural language
user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=TextInfo,
    messages= [{"role": "system", "content": "Indentifica el sentiemiento y la longitud del texto"},
        {"role": "user", "content": "La verdad hoy me he sentido algo triste, no se que hacer"}
    ],
    max_retries=2
)

print(user_info.model_dump())

{'sentiment': <SentimentEnum.NEUTRAL: 'neutral'>, 'length_of_text': 55}


### example 4

In [41]:
from typing import List

class ListSchema(BaseModel):
    feelings: List[str] = Field(description="A list of string items")

    model_config: ClassVar[dict] = {
        "json_schema_extra": {
            "examples": [
                {
                    "feelings": ["happy", "sad", "angry", "excited"]
                }
            ]
        }
    }

In [42]:
prompt_system = """actua como un identificador de identidades y lista los sentimientos en los chats escritos por los usuarios
solo identifica las entidades en el texto
"""

prompt_user = """Me comunicque ayer para si me podrian ayudar con el problema de conexion pero la atencion no fue nada buena, estoy muy molesto y enojado,
luego me pasaron a otro operador y luego otro, la verdad terminé frustrado
"""

# Extract structured data from natural language
user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=ListSchema,
    messages= [{"role": "system", "content": prompt_system},
        {"role": "user", "content": prompt_user}
    ],
    max_retries=2
)

print(user_info.model_dump())

{'feelings': ['molesto', 'enojado', 'frustrado']}


As we notice we can restricted our ouput as a list of feeling in spanish