In [12]:
import os
import re
import torch
import json
from datetime import datetime
from PyPDF2 import PdfReader

from pydantic import BaseModel, Field, validator
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
model_id = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def normalize_euro_format(amount: str) -> str:
    amount = amount.strip()
    amount = re.sub(r"(?i)\b(eur|euro|€)\b\s*", "", amount)
    if re.match(r"^\d{4,},\d{2}$", amount):
        integer, decimal = amount.split(",")
        integer = f"{int(integer):,}".replace(",", ".")
        amount = f"{integer},{decimal}"
    return f"{amount}"

def normalize_date(date_str: str) -> str:
    # Try parsing flexible date formats into DD/MM/YYYY
    try:
        dt = datetime.strptime(date_str.strip(), "%Y-%m-%d")
        return dt.strftime("%d/%m/%Y")
    except:
        try:
            dt = datetime.strptime(date_str.strip(), "%d/%m/%Y")
            return dt.strftime("%d/%m/%Y")
        except:
            return date_str  # fallback

def normalize_time(time_str: str) -> str:
    try:
        dt = datetime.strptime(time_str.strip(), "%H:%M")
        return dt.strftime("%H:%M:%S")
    except:
        try:
            dt = datetime.strptime(time_str.strip(), "%H:%M:%S")
            return dt.strftime("%H:%M:%S")
        except:
            return time_str

In [4]:
class ExtractedInfo(BaseModel):
    type: str = Field(description="Intent type, 'RefundRequest', 'Complaint', 'Change' ")
    amount: str = Field(description="formatted as German-style Euro like '1.240,75 €' Do not use 'Euro' or 'EUR'. Use only the symbol € at the end. Always use dot for thousand separator and comma for decimals.")
    subject: str = Field(description="Product or issue subject")
    date: str = Field(description="Date in DD/MM/YYYY format")
    time: str = Field(default="", description="Optional time in HH:MM:SS format")

    @validator("amount")
    def clean_amount(cls, v): return normalize_euro_format(v)

    @validator("date")
    def clean_date(cls, v): return normalize_date(v)

    @validator("time")
    def clean_time(cls, v): return normalize_time(v)

parser = PydanticOutputParser(pydantic_object=ExtractedInfo)

C:\Users\citak\AppData\Local\Temp\ipykernel_20960\4091590641.py:8: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  @validator("amount")
C:\Users\citak\AppData\Local\Temp\ipykernel_20960\4091590641.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  @validator("date")
C:\Users\citak\AppData\Local\Temp\ipykernel_20960\4091590641.py:14: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should mig

In [5]:
prompt_template = PromptTemplate(
    template="""
You are an intelligent assistant. Extract the following fields from the customer message below:

{format_instructions}

Message:
\"\"\"{input}\"\"\"
""",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [6]:
user_input = "Yesterday (18th Mayy of 2025), I bought a laptop from your store but it’s broken. I want a refund of Euro 3776,88. All these things started around around 2:39 pm"

final_prompt = prompt_template.format(input=user_input)


print(final_prompt)


You are an intelligent assistant. Extract the following fields from the customer message below:

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"type": {"description": "Intent type, 'RefundRequest', 'Complaint', 'Change' ", "title": "Type", "type": "string"}, "amount": {"description": "formatted as German-style Euro like '1.240,75 €' Do not use 'Euro' or 'EUR'. Use only the symbol € at the end. Always use dot for thousand separator and comma for decimals.", "title": "Amount", "type": "string"}, "subject": {"description": "Product or issue subject", "title": "Subject", "typ

In [7]:
inputs = tokenizer(final_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.001, repetition_penalty=1.05)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)

  attn_output = torch.nn.functional.scaled_dot_product_attention(



You are an intelligent assistant. Extract the following fields from the customer message below:

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"type": {"description": "Intent type, 'RefundRequest', 'Complaint', 'Change' ", "title": "Type", "type": "string"}, "amount": {"description": "formatted as German-style Euro like '1.240,75 €' Do not use 'Euro' or 'EUR'. Use only the symbol € at the end. Always use dot for thousand separator and comma for decimals.", "title": "Amount", "type": "string"}, "subject": {"description": "Product or issue subject", "title": "Subject", "typ

In [8]:
matches = re.findall(r"\{[\s\S]*?\}", decoded)
json_blob = matches[-1] if matches else "{}"
structured_data = parser.parse(json_blob)

print("\n🎯 Extracted JSON:")
print(structured_data.dict())


🎯 Extracted JSON:
{'type': 'RefundRequest', 'amount': '3776,88 €', 'subject': 'laptop', 'date': '18/05/2025', 'time': '14:39'}


In [9]:
### PDF Text Integration

In [13]:
def extract_text_from_pdf(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    return text.strip()

In [14]:
pdf_path = "documents/doc-02.pdf"
text_from_pdf = extract_text_from_pdf(pdf_path)

print(text_from_pdf)

Banca Italiana di Credito
 CD-IT-78901-2025
 
 CERTIFICATO DI DEPOSITO
 
Data di emissione: 1 marzo 2025
Numero di riferimento: CD-2025-78901
Nome e Cognome: Marco Rossi
Codice Fiscale: RSSMRC75P12H501T
Indirizzo: Via Veneto 45, 00187 Roma, Italia
Telefono: +39 345 678 9012
Dettagli del Certificato
Caratteristica
Dettaglio
Importo depositato
€25.000,00
Durata
24 mesi
Data di scadenza
1 marzo 2027
Tasso di interesse annuo
2,75% (fisso)
Periodicità interessi
Semestrale
Rendimento totale alla scadenza
€1.375,00
Informazioni sul certificato
Gentile Sig. Rossi, con il presente documento confermiamo l'emissione di un Certificato di Deposito
per un importo di €25.000,00 con scadenza il 1 marzo 2027.
Piano di maturazione degli interessi
Data
Interessi (€)
Ritenuta fiscale (€)
Interessi netti (€)
1 settembre 2025
343,75
89,38
254,38
1 marzo 2026
343,75
89,38
254,38
1 settembre 2026
343,75
89,38
254,38
1 marzo 2027
343,75
89,38
254,38
Totale
1.375,00
357,50
1.017,50

Condizioni
 Il certificato 

In [15]:
prompt = prompt_template.format(input=text_from_pdf)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.001, repetition_penalty=1.05)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
matches = re.findall(r"\{[\s\S]*?\}", decoded)
json_blob = matches[-1] if matches else "{}"
parsed = parser.parse(json_blob)

print("\n🎯 Extracted Structured Data:")
print(parsed.dict())


🎯 Extracted Structured Data:
{'type': 'RefundRequest', 'amount': '€25.000,00', 'subject': 'Certificato di Deposito', 'date': '01/03/2025', 'time': ''}
