In [1]:
import os
import re
import ast
import json
import torch
from datetime import datetime
from datetime import datetime
from PyPDF2 import PdfReader

from pydantic import BaseModel, Field, validator
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import outlines
from enum import Enum
from langchain_community.llms import Outlines

from pydantic import BaseModel

In [3]:
# Hugging Face model id used for classification. Other ids listed as alternatives:
# HuggingFaceTB/SmolLM2-135M-Instruct, HuggingFaceTB/SmolLM2-1.7B-Instruct, Qwen/Qwen2.5-3B-Instruct
model_id = "Qwen/Qwen2.5-3B-Instruct"

# Load the causal LM in bfloat16 with device_map="cuda", then the tokenizer
# that belongs to the same checkpoint.
_hf_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
_hf_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Wrap model + tokenizer in an Outlines model; later cells call it as
# `llm(prompt, output_type, **generation_kwargs)`.
llm = outlines.from_transformers(_hf_model, _hf_tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.88s/it]


In [4]:
# Enum for document classes: the closed set of categories a document can be
# assigned to. Mixing in `str` makes every member compare equal to its plain
# string value.
# Documented with `#` comments (not a class docstring) so that the class's
# `__doc__` stays exactly as it was.
class DocumentClasses(str, Enum):
    investment = "investment"              # investment-related documents
    personal_account = "personal_account"  # personal account statements, closures, balances
    garnishment = "garnishment"            # wage garnishment / account seizure notices
    credit = "credit"                      # credit or loan-related documents

# Enum for languages: the closed set of languages a document can be tagged with.
# Member names are lowercase identifiers; the values are the capitalized
# English language names, which are what appears in the JSON output.
# Documented with `#` comments (not a class docstring) so that the class's
# `__doc__` stays exactly as it was.
class DocumentLanguage(str, Enum):
    english = "English"
    french = "French"
    italian = "Italian"
    spanish = "Spanish"
    german = "German"

# Description map for document classes: one human-readable explanation per
# DocumentClasses member, keyed by the enum member itself.
# These strings are interpolated into the `doc_class` field description of
# DocClassification, so they become part of the schema text given to the LLM;
# rewording them changes the prompt.
DOCUMENT_CLASS_DESCRIPTIONS = {
    DocumentClasses.investment: "Investment-related documents (e.g., portfolio statements, fund details)",
    DocumentClasses.personal_account: "Personal account documents such as statements, account closures, or balance details",
    DocumentClasses.garnishment: "Legal notices regarding wage garnishment or account seizure",
    DocumentClasses.credit: "Credit or loan-related documents, including credit card statements and loan agreements"
}

# Pydantic model for classification: the structured result for one document,
# consisting of its category (`doc_class`) and its language (`language`).
#
# Both field descriptions are derived from the enums (and from
# DOCUMENT_CLASS_DESCRIPTIONS) instead of being typed out member by member, so
# adding or renaming an enum member cannot leave the schema text stale. With
# the current enums the generated strings are identical to the previous
# hand-written ones. A DocumentClasses member without an entry in
# DOCUMENT_CLASS_DESCRIPTIONS raises KeyError as soon as this class is defined.
#
# NOTE(review): documented with `#` comments on purpose; a class docstring
# would presumably be copied into the generated JSON schema as a description
# and thereby change the prompt. Confirm before adding one.
class DocClassification(BaseModel):
    doc_class: DocumentClasses = Field(
        # One "- <value>: <description>" bullet per class, in enum definition order.
        description="Class of the document. Possible values:\n"
        + "\n".join(
            f"- {member.value}: {DOCUMENT_CLASS_DESCRIPTIONS[member]}"
            for member in DocumentClasses
        )
    )
    language: DocumentLanguage = Field(
        # Comma-separated language values, in enum definition order.
        description="Language of the document. Possible values: "
        + ", ".join(member.value for member in DocumentLanguage)
    )

In [5]:
# Create Chat Template for Classification

# Output parser bound to the DocClassification schema. In the cells shown it is
# used only to produce the format instructions below.
parser = PydanticOutputParser(pydantic_object=DocClassification)

# Auto-generate format instructions for the LLM: a text block that embeds the
# JSON schema of DocClassification and explains how the output must be shaped.
format_instructions = parser.get_format_instructions()

# Display the instructions; create_classification_prompt() appends this exact
# text to every prompt it builds.
print(format_instructions)

def create_classification_prompt(document_text: str):
    """
    Build the complete classification prompt for a single document.

    The prompt has two turns: a fixed system message describing the task, and
    a user message made of the document text followed by the module-level
    ``format_instructions`` (the JSON-schema output instructions).

    Args:
        document_text (str): Raw text of the document to classify.

    Returns:
        str: The rendered prompt, ready to be passed to the model.
    """
    task_description = (
        "You are Qwen, created by Alibaba Cloud. "
        "Your task is to classify the document according to the provided schema. "
        "Return the result as valid JSON."
    )

    # Document first, output-format instructions last.
    user_payload = f"The document you need to analyze:\n\n{document_text}\n\n{format_instructions}"

    # The payload is injected through the `{doc}` placeholder rather than being
    # written into the template itself, so literal braces in the document or
    # in the JSON schema are not interpreted as template variables.
    chat_messages = [
        ("system", task_description),
        ("user", "{doc}"),
    ]
    return ChatPromptTemplate.from_messages(chat_messages).format(doc=user_payload)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"DocumentClasses": {"enum": ["investment", "personal_account", "garnishment", "credit"], "title": "DocumentClasses", "type": "string"}, "DocumentLanguage": {"enum": ["English", "French", "Italian", "Spanish", "German"], "title": "DocumentLanguage", "type": "string"}}, "properties": {"doc_class": {"$ref": "#/$defs/DocumentClasses", "description": "Class of the document. Possible values:\n- investment: Investment-related documents (e.g., portfolio statements, fund details)\n- personal_account: Personal account documents such as statem

In [6]:
# Sample document used to exercise the classification pipeline.
sample_doc_name = "doc-02.txt"
sample_doc_text_path = os.path.join("documents_banking_txt", sample_doc_name)

# Decode explicitly as UTF-8: the documents contain non-ASCII characters
# (e.g. "€", "à", "•"), and without an explicit encoding open() uses the
# platform's locale encoding, which is not UTF-8 on every system.
with open(sample_doc_text_path, "r", encoding="utf-8") as document:
    sample_doc_text = document.read()

# Show the raw text that will be embedded in the prompt.
print(sample_doc_text)

Banca Italiana di Credito
CD-IT-78901-2025
CERTIFICATO DI DEPOSITO
Data di emissione: 1 marzo 2025
Numero di riferimento: CD-2025-78901
Nome e Cognome: Marco Rossi
Codice Fiscale: RSSMRC75P12H501T
Indirizzo: Via Veneto 45, 00187 Roma, Italia
Telefono: +39 345 678 9012
Dettagli del Certificato
Caratteristica
Dettaglio
Importo depositato
€25.000,00
Durata
24 mesi
Data di scadenza
1 marzo 2027
Tasso di interesse annuo
2,75% (fisso)
Periodicità interessi
Semestrale
Rendimento totale alla scadenza
€1.375,00
Informazioni sul certificato
Gentile Sig. Rossi, con il presente documento confermiamo l'emissione di un Certificato di Deposito
per un importo di €25.000,00 con scadenza il 1 marzo 2027.
Piano di maturazione degli interessi
Data
Interessi (€)
Ritenuta fiscale (€)
Interessi netti (€)
1 settembre 2025
343,75
89,38
254,38
1 marzo 2026
343,75
89,38
254,38
1 settembre 2026
343,75
89,38
254,38
1 marzo 2027
343,75
89,38
254,38
Totale
1.375,00
357,50
1.017,50
Condizioni
• Il certificato di depo

In [7]:
# Render the full prompt (system message + user message with the document and
# the format instructions) for the sample document.
classification_prompt = create_classification_prompt(sample_doc_text)

# Print it to inspect the exact text that is passed to the model in the next cell.
print(classification_prompt)

System: You are Qwen, created by Alibaba Cloud. Your task is to classify the document according to the provided schema. Return the result as valid JSON.
Human: The document you need to analyze:

Banca Italiana di Credito
CD-IT-78901-2025
CERTIFICATO DI DEPOSITO
Data di emissione: 1 marzo 2025
Numero di riferimento: CD-2025-78901
Nome e Cognome: Marco Rossi
Codice Fiscale: RSSMRC75P12H501T
Indirizzo: Via Veneto 45, 00187 Roma, Italia
Telefono: +39 345 678 9012
Dettagli del Certificato
Caratteristica
Dettaglio
Importo depositato
€25.000,00
Durata
24 mesi
Data di scadenza
1 marzo 2027
Tasso di interesse annuo
2,75% (fisso)
Periodicità interessi
Semestrale
Rendimento totale alla scadenza
€1.375,00
Informazioni sul certificato
Gentile Sig. Rossi, con il presente documento confermiamo l'emissione di un Certificato di Deposito
per un importo di €25.000,00 con scadenza il 1 marzo 2027.
Piano di maturazione degli interessi
Data
Interessi (€)
Ritenuta fiscale (€)
Interessi netti (€)
1 settembre 

In [8]:
# Run the model on the prompt with DocClassification as the requested output
# type; the call returns the generated JSON as text.
# NOTE(review): temperature=0.7 suggests sampling, so repeated runs may yield
# different classifications. Confirm this is intended for a classification task.
pred_doc_class_json = llm(
    classification_prompt,
    DocClassification,
    max_new_tokens=128,
    temperature=0.7,
    repetition_penalty=1.0,
)

# Raw JSON text exactly as generated.
print(pred_doc_class_json)

# Parse and validate the JSON into a typed DocClassification instance. The raw
# string keeps its own name so that `pred_doc_class` only ever refers to the
# validated model, never to a plain str.
pred_doc_class = DocClassification.model_validate_json(pred_doc_class_json)

print(pred_doc_class)

{ "doc_class": "investment", "language": "Italian" }
doc_class=<DocumentClasses.investment: 'investment'> language=<DocumentLanguage.italian: 'Italian'>
