In [1]:
import os
import re
import ast
import json
import torch
from datetime import datetime
from PyPDF2 import PdfReader

from pydantic import BaseModel, Field, validator
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import outlines
from enum import Enum
from langchain_community.llms import Outlines

from pydantic import BaseModel

In [3]:
model_id = "Qwen/Qwen2.5-3B-Instruct" # HuggingFaceTB/SmolLM2-135M-Instruct, HuggingFaceTB/SmolLM2-1.7B-Instruct, Qwen/Qwen2.5-3B-Instruct

# Use the GPU when one is visible; otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA instead of failing at load time.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Wrap the Hugging Face model + tokenizer in an Outlines model; `llm` is the
# callable used later for schema-guided generation.
llm = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.bfloat16),
    AutoTokenizer.from_pretrained(model_id)
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.60s/it]


In [4]:
class RiskLevel(str, Enum):
    # Closed set of risk levels; used as the type of `Investment.risk_profile`.
    # Mixing in `str` makes every member a plain string equal to its value.
    # NOTE(review): documented with comments rather than a docstring because a
    # docstring may be surfaced in the generated JSON schema — confirm before adding one.
    low = "low"
    medium = "medium"
    high = "high"


class Investment(BaseModel):
    # Entities to extract for the 'investment' document class.
    # Each field's `description` is part of the schema handed to the LLM.
    # NOTE(review): comments are used instead of a class docstring because a
    # docstring may be added to the generated JSON schema — confirm before adding one.
    portfolio_id: str = Field(description="Unique identifier for the investment portfolio")
    portfolio_value: float = Field(description="Current total value of all investments in the portfolio")
    # NOTE(review): typed `str` although the description reads as a count —
    # confirm whether `int` was intended.
    asset_number: str = Field(description="Number of different asset types of the portfolio")
    # Restricted to the members of RiskLevel (low / medium / high).
    risk_profile: RiskLevel = Field(description="Assessment of the portfolio's risk level")

###########################################
###########################################

class AccountType(str, Enum):
    # Closed set of account kinds; used as the type of `PersonalAccount.account_type`.
    # Mixing in `str` makes every member a plain string equal to its value.
    # NOTE(review): documented with comments rather than a docstring because a
    # docstring may be surfaced in the generated JSON schema — confirm before adding one.
    checking = "checking"
    savings = "savings"
    money_market = "money_market"
    certificate_of_deposit = "certificate_of_deposit"
    credit = "credit"
    loan = "loan"

class PersonalAccount(BaseModel):
    # Entities to extract for the 'personal_account' document class.
    # Each field's `description` is part of the schema handed to the LLM.
    # NOTE(review): comments are used instead of a class docstring because a
    # docstring may be added to the generated JSON schema — confirm before adding one.
    account_number: str = Field(description="Partially masked bank account number")
    # Restricted to the members of AccountType.
    account_type: AccountType = Field(description="Type of account")
    # NOTE(review): typed `str` although the description reads as a count —
    # confirm whether `int` was intended.
    transaction_number: str = Field(description="Number of financial activities affecting the account during the statement period")


###########################################
###########################################

class Garnishment(BaseModel):
    # Entities to extract for the 'garnishment' document class.
    # Each field's `description` is part of the schema handed to the LLM.
    # NOTE(review): comments are used instead of a class docstring because a
    # docstring may be added to the generated JSON schema — confirm before adding one.
    debtor_name: str = Field(description="Name of the individual whose assets are being garnished")
    creditor_name: str = Field(description="Name of the entity (person/organization) to whom the debt is owed")
    # NOTE(review): the type is `datetime` (date AND time) while the description
    # speaks of a date only — confirm whether `datetime.date` was intended.
    effective_date: datetime = Field(description="Date when the garnishment takes effect")

###########################################
###########################################

class Credit(BaseModel):
    # Entities to extract for the 'credit' document class.
    # Each field's `description` is part of the schema handed to the LLM.
    # NOTE(review): comments are used instead of a class docstring because a
    # docstring may be added to the generated JSON schema — confirm before adding one.
    card_number: str = Field(description="Partially masked credit card number")
    credit_limit: float = Field(description="Maximum amount of credit extended to the customer")
    # NOTE(review): the unit is not specified (e.g. 2.75 vs 0.0275 for 2.75%) —
    # confirm what downstream consumers expect.
    interest_rate: float = Field(description="Annual percentage rate applied to outstanding balances")    

In [5]:
# Named-entity extraction setup: schemas, output parsers and format instructions.

# Document class -> Pydantic schema describing the entities to extract.
pydantic_dict = dict(
    investment=Investment,
    personal_account=PersonalAccount,
    garnishment=Garnishment,
    credit=Credit,
)

# One LangChain output parser per schema, in the same order as above.
parser_investment, parser_personalAccount, parser_garnishment, parser_credit = (
    PydanticOutputParser(pydantic_object=schema)
    for schema in (Investment, PersonalAccount, Garnishment, Credit)
)

# Format instructions generated by each parser; these are appended to the prompt.
(
    format_instructions_investment,
    format_instructions_personalAccount,
    format_instructions_garnishment,
    format_instructions_credit,
) = (
    parser.get_format_instructions()
    for parser in (parser_investment, parser_personalAccount, parser_garnishment, parser_credit)
)

# Document class -> format instructions, keyed like `pydantic_dict`.
format_instruction_dict = dict(
    investment=format_instructions_investment,
    personal_account=format_instructions_personalAccount,
    garnishment=format_instructions_garnishment,
    credit=format_instructions_credit,
)


def create_ner_prompt(document_text: str, doc_class: str) -> str:
    """
    Create a structured Named-Entity Extraction prompt for the LLM using the Pydantic schema.

    Args:
        document_text (str): The raw text of the document to extract entities from.
        doc_class (str): Key of ``format_instruction_dict`` selecting which
            schema's format instructions are appended to the prompt.

    Returns:
        str: The final prompt ready to send to the model.

    Raises:
        KeyError: If ``doc_class`` is not a key of ``format_instruction_dict``.
    """
    # Fail early with a message that lists the valid classes instead of a bare KeyError.
    try:
        format_instructions = format_instruction_dict[doc_class]
    except KeyError:
        raise KeyError(
            f"Unknown doc_class {doc_class!r}; expected one of {sorted(format_instruction_dict)}"
        ) from None

    system_msg = (
        "You are Qwen, created by Alibaba Cloud. "
        "Your task is to extract proper entities from the document according to the provided schema. "
        "Return the result as valid JSON."
    )

    user_msg = (
        f"The document you need to analyze:\n\n{document_text}\n\n"
        f"{format_instructions}"
    )

    # Build the chat template with system + user messages.
    # The user content goes in through the `{doc}` variable rather than being
    # embedded in the template string, so braces in the document or in the
    # format instructions are not interpreted as template placeholders.
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", system_msg),
        ("user", "{doc}")
    ])

    return prompt_template.format(doc=user_msg)

In [6]:
# Sample document used to try out the extraction pipeline, and its document class
# (the class must be a key of the schema / format-instruction dictionaries).
sample_doc_name = "doc-02.txt"
sample_doc_class = "investment"
sample_doc_text_path = os.path.join("documents_banking_txt", sample_doc_name)

# Decode explicitly instead of relying on the platform's default encoding: the
# documents contain non-ASCII characters (e.g. "€", accented letters).
# Assumes the .txt files are stored as UTF-8 — TODO confirm.
with open(sample_doc_text_path, "r", encoding="utf-8") as document:
    sample_doc_text = document.read()

print(sample_doc_text)

Banca Italiana di Credito
CD-IT-78901-2025
CERTIFICATO DI DEPOSITO
Data di emissione: 1 marzo 2025
Numero di riferimento: CD-2025-78901
Nome e Cognome: Marco Rossi
Codice Fiscale: RSSMRC75P12H501T
Indirizzo: Via Veneto 45, 00187 Roma, Italia
Telefono: +39 345 678 9012
Dettagli del Certificato
Caratteristica
Dettaglio
Importo depositato
€25.000,00
Durata
24 mesi
Data di scadenza
1 marzo 2027
Tasso di interesse annuo
2,75% (fisso)
Periodicità interessi
Semestrale
Rendimento totale alla scadenza
€1.375,00
Informazioni sul certificato
Gentile Sig. Rossi, con il presente documento confermiamo l'emissione di un Certificato di Deposito
per un importo di €25.000,00 con scadenza il 1 marzo 2027.
Piano di maturazione degli interessi
Data
Interessi (€)
Ritenuta fiscale (€)
Interessi netti (€)
1 settembre 2025
343,75
89,38
254,38
1 marzo 2026
343,75
89,38
254,38
1 settembre 2026
343,75
89,38
254,38
1 marzo 2027
343,75
89,38
254,38
Totale
1.375,00
357,50
1.017,50
Condizioni
• Il certificato di depo

In [7]:
# Assemble the extraction prompt for the sample document and print it for inspection.
ner_prompt = create_ner_prompt(
    document_text=sample_doc_text,
    doc_class=sample_doc_class,
)

print(ner_prompt)

System: You are Qwen, created by Alibaba Cloud. Your task is to extract proper entities from the document according to the provided schema. Return the result as valid JSON.
Human: The document you need to analyze:

Banca Italiana di Credito
CD-IT-78901-2025
CERTIFICATO DI DEPOSITO
Data di emissione: 1 marzo 2025
Numero di riferimento: CD-2025-78901
Nome e Cognome: Marco Rossi
Codice Fiscale: RSSMRC75P12H501T
Indirizzo: Via Veneto 45, 00187 Roma, Italia
Telefono: +39 345 678 9012
Dettagli del Certificato
Caratteristica
Dettaglio
Importo depositato
€25.000,00
Durata
24 mesi
Data di scadenza
1 marzo 2027
Tasso di interesse annuo
2,75% (fisso)
Periodicità interessi
Semestrale
Rendimento totale alla scadenza
€1.375,00
Informazioni sul certificato
Gentile Sig. Rossi, con il presente documento confermiamo l'emissione di un Certificato di Deposito
per un importo di €25.000,00 con scadenza il 1 marzo 2027.
Piano di maturazione degli interessi
Data
Interessi (€)
Ritenuta fiscale (€)
Interessi ne

In [8]:
# Seed torch's RNGs right before generating so that re-running this cell
# reproduces the same extraction (a temperature is set, so generation may sample).
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Generate with the schema of the sample's document class as the output type;
# the result is JSON text (validated below).
# NOTE(review): max_new_tokens=128 may cut the JSON short for larger schemas,
# which would make the validation step below fail — confirm the budget.
pred_doc_json = llm(
    ner_prompt,
    pydantic_dict[sample_doc_class],
    max_new_tokens=128,
    temperature=0.7,
    repetition_penalty=1.0
    )

print(pred_doc_json)

# Parse the raw JSON into the Pydantic model. The raw text is kept under its own
# name so `pred_doc_entities` only ever holds the validated model instance.
pred_doc_entities = pydantic_dict[sample_doc_class].model_validate_json(pred_doc_json)

print(pred_doc_entities)

{ "portfolio_id": "CD-IT-78901-2025", "portfolio_value": 25000.0, "asset_number": "1", "risk_profile": "medium" }
portfolio_id='CD-IT-78901-2025' portfolio_value=25000.0 asset_number='1' risk_profile=<RiskLevel.medium: 'medium'>
