In [1]:
import os
import re
import ast
import json
import torch
from datetime import datetime
from datetime import datetime
from PyPDF2 import PdfReader

from pydantic import BaseModel, Field, validator
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.schema.runnable import RunnableBranch, RunnableLambda, RunnableMap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import utils_chat_template
import utils_structured_output

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"DocumentClasses": {"enum": ["investment", "personal_account", "garnishment", "credit"], "title": "DocumentClasses", "type": "string"}, "DocumentLanguage": {"enum": ["English", "French", "Italian", "Spanish", "German"], "title": "DocumentLanguage", "type": "string"}}, "properties": {"doc_class": {"$ref": "#/$defs/DocumentClasses", "description": "Class of the document. Possible values:\n- investment: Investment-related documents (e.g., portfolio statements, fund details)\n- personal_account: Personal account documents such as statem

In [3]:
import importlib

importlib.reload(utils_chat_template)
importlib.reload(utils_structured_output)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"DocumentClasses": {"enum": ["investment", "personal_account", "garnishment", "credit"], "title": "DocumentClasses", "type": "string"}, "DocumentLanguage": {"enum": ["English", "French", "Italian", "Spanish", "German"], "title": "DocumentLanguage", "type": "string"}}, "properties": {"doc_class": {"$ref": "#/$defs/DocumentClasses", "description": "Class of the document. Possible values:\n- investment: Investment-related documents (e.g., portfolio statements, fund details)\n- personal_account: Personal account documents such as statem

<module 'utils_structured_output' from 'c:\\Users\\citak\\Desktop\\LLMS\\StructuredOutput\\utils_structured_output.py'>

In [4]:
import outlines
from enum import Enum
from langchain_community.llms import Outlines

from pydantic import BaseModel

In [5]:
model_id = "Qwen/Qwen2.5-3B-Instruct" # HuggingFaceTB/SmolLM2-135M-Instruct, HuggingFaceTB/SmolLM2-1.7B-Instruct, Qwen/Qwen2.5-3B-Instruct

llm = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", torch_dtype=torch.bfloat16),
    AutoTokenizer.from_pretrained(model_id)
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.15s/it]


In [6]:
classification_parser = PydanticOutputParser(pydantic_object=utils_structured_output.DocClassification)

classification_chain = (
    RunnableLambda(lambda x: utils_chat_template.create_classification_prompt(x["document_text"]))
    | RunnableLambda(lambda prompt: llm(prompt, max_new_tokens=256, temperature=0.1, do_sample=False))  # Pass generation params here
    | RunnableLambda(lambda raw: (print("\n[RAW LLM OUTPUT - Classification]:", raw), raw)[1])
    | classification_parser
).with_config({"verbose": True})

In [7]:
sample_doc_name = "doc-03.txt"
sample_doc_text_path = os.path.join("documents_banking_txt", sample_doc_name)

with open(sample_doc_text_path, "r") as document:
    sample_doc_text = document.read()

classification_chain_result = classification_chain.invoke({"document_text": sample_doc_text})

print("-"*20)
print(classification_chain_result)
print(classification_chain_result.doc_class)
print(classification_chain_result.language)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



[RAW LLM OUTPUT - Classification]: 


Assistant: ```json
{
  "doc_class": "garnishment",
  "language": "German"
}
```
--------------------
doc_class=<DocumentClasses.garnishment: 'garnishment'> language=<DocumentLanguage.german: 'German'>
DocumentClasses.garnishment
DocumentLanguage.german


In [8]:
def create_ner_chain(doc_class):
    parser = PydanticOutputParser(pydantic_object=utils_chat_template.pydantic_dict[doc_class])

    print(parser)

    return (
        RunnableLambda(lambda x: utils_chat_template.create_ner_prompt(
            x["document_text"], 
            x["classification"].doc_class,   # Pydantic property access
            x["classification"].language     # Pydantic property access
        ))
        | RunnableLambda(lambda prompt: llm(prompt, max_new_tokens=256, temperature=0.1, do_sample=False))  # Pass generation params here
        | RunnableLambda(lambda raw: (print("\n[RAW LLM OUTPUT - NER]:", raw), raw)[1])
        | parser
    ).with_config({"verbose": True})

In [9]:
ner_branch = RunnableBranch(
    (lambda x: x["classification"].doc_class == "investment", create_ner_chain("investment")),
    (lambda x: x["classification"].doc_class == "personal_account", create_ner_chain("personal_account")),
    (lambda x: x["classification"].doc_class == "garnishment", create_ner_chain("garnishment")),
    (lambda x: x["classification"].doc_class == "credit", create_ner_chain("credit")),
    create_ner_chain("personal_account")  # default branch (fallback)
)

def debug_step(x):
    print("\n=== DEBUG STEP ===")
    print("Keys in x:", x.keys())
    print("Document text:", x['document_text'])
    print("Classification type:", type(x["classification"]))
    print("Classification doc_class:", x["classification"].doc_class)
    print("Classification language:", x["classification"].language)
    return x


full_pipeline = (
    RunnableMap({
        "document_text": lambda x: x["document_text"],
        "classification": classification_chain
    })
    | RunnableLambda(debug_step)
    | ner_branch
)


pydantic_object=<class 'utils_structured_output.Investment'>
pydantic_object=<class 'utils_structured_output.PersonalAccount'>
pydantic_object=<class 'utils_structured_output.Garnishment'>
pydantic_object=<class 'utils_structured_output.Credit'>
pydantic_object=<class 'utils_structured_output.PersonalAccount'>


In [10]:
sample_doc_text_path = os.path.join("documents_banking_txt", sample_doc_name)

with open(sample_doc_text_path, "r") as document:
    sample_doc_text = document.read()

result = full_pipeline.invoke({"document_text": sample_doc_text})

print(result)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



[RAW LLM OUTPUT - Classification]: 


Assistant: ```json
{
  "doc_class": "garnishment",
  "language": "German"
}
```

=== DEBUG STEP ===
Keys in x: dict_keys(['document_text', 'classification'])
Document text: Gunar Radel
Obergerichtsvolizieher
Egbert-Schater-Allee 6/7 28447 Schwandort

Zustellungsurkunde
Empfanger:

N26 Bank AG

Voltairestrase 8

10179 Berlin, Germany

(Schuldn.: Frau Prof. Emin Schuler B.A., BenderstraSe 32, 72675 Lemgo)

Pfandungs- und Uberweisungsbeschluss des Amstgerichts vom 16. February
2025, Az. DF 9565/88

rnebst einer beglaubigten Anschrift dieser Zustellungsurkunde habe ich heute im Auttrag
des

Gliubigers: Zimmer GbR, Holm-Margrat-Strabe 57, 06901 Artern

vertreten durch: Rosemann Services, Kristiane-Girschner-Weg 827, 38389
Tuttlingen, Az. 6622207/RX626716

da ich in dem Geschattsraum der oben genannten Bank die Zustellung vorgenomm
hhabe. Die Zustellung erfolgte gemab § 750 ZPO in Verbindung mit § 829 ZPO. Der
‘Schuldner wurde Uber die Pfandung und Uber