In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import os

load_dotenv()

api_key = os.environ.get("OPENAI_TOKEN")
client = OpenAI(api_key=api_key)

# Questions normalization

In [13]:
instruction = f"""
You are tasked with normalizing text from customer communications with an online cosmetics shop in Czech language. The goal is to refine the text to make it structurally, grammatically, and punctuationally correct while preserving the original meaning and making only minimal necessary changes.

Guidelines for text normalization:
1. Correct obvious spelling errors
2. Fix grammatical mistakes
3. Adjust punctuation where necessary
4. Improve sentence structure if it's unclear
5. Maintain the original tone and style of the message
6. Preserve all information from the original text
7. Make only minimal changes required for clarity and correctness

Here is the original text to be normalized:

<original_text>
{{ORIGINAL_TEXT}}
</original_text>

Please analyze the text for errors and areas that need improvement. Then, make minimal changes to normalize the text while ensuring that all original information remains intact.

Provide your normalized version of the text in json format.
Example:
{{{{
  "normalized_text": "Dobrý den, chtěl bych si objednat rtěnku, ale nevím, jaká barva by se mi hodila. Můžete mi poradit?",
}}}}
"""

In [8]:
text_to_fix = '"Dobrý den, moc se nevyznám v rozdílu mezi Rozjasňujícim anti age make upem a dvousložkovým make upem. Můžete mi prosím vysvětlit hlavní rozdíl a co lépe kryje nedokonalosti? A proč se už neprodává báze pod make up? Čím tedy nahradit?"'

In [19]:
from pydantic import BaseModel, Field, validator

class NormalizedTextResponse(BaseModel):
    normalized_text: str = Field(..., min_length=1, description="The normalized version of the original text")
    
    @validator('normalized_text')
    def validate_normalized_text(cls, v):
        if not v or v.isspace():
            raise ValueError('normalized_text cannot be empty or contain only whitespace')
        return v.strip()

C:\Users\AG\AppData\Local\Temp\ipykernel_19648\622508924.py:6: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator('normalized_text')


In [16]:
def normalize_question(question:str):
    prompt = instruction.format(ORIGINAL_TEXT = question)

    response = client.responses.create(
        model="gpt-4o-2024-11-20",
        input=prompt,
    )

    parsed_response = NormalizedTextResponse.model_validate_json(response.output_text)
    json_output = parsed_response.model_dump_json()

    return json_output

In [20]:
norm_question = normalize_question('"Dobrý den, moc se nevyznám v rozdílu mezi Rozjasňujícim anti age make upem a dvousložkovým make upem. Můžete mi prosím vysvětlit hlavní rozdíl a co lépe kryje nedokonalosti? A proč se už neprodává báze pod make up? Čím tedy nahradit?"')

ValidationError: 1 validation error for NormalizedTextResponse
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n  "normalize...tedy nahradit?"\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid

In [17]:
def create_normalized_question(qa_data):
    """Create embeddings and organize for efficient retrieval"""
    
    enhanced_data = []
    
    for qa in qa_data:
        # Create embeddings
        norm_question = normalize_question(qa['question'])

        # Add to original structure
        enhanced_qa = {
            **qa,  # Original data
            'question_norm': norm_question,
        }
        
        enhanced_data.append(enhanced_qa)
    
    return enhanced_data

In [None]:
import json 

filename = 'data/qa.json'
with open(filename, 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

In [None]:
data_w_norm_questions = create_normalized_question(qa_data)