# Training Code to be used

In [1]:
pip install verovio

Collecting verovio
  Downloading verovio-5.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.2 kB)
Downloading verovio-5.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m8.6/8.7 MB[0m [31m258.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m143.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: verovio
Successfully installed verovio-5.7.0


In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import cv2

In [3]:
if __name__ == '__main__':
    def in_colab():
        try:
            import google.colab
            return True
        except ImportError:
            return False

    if in_colab():
        from google.colab import drive
        drive.mount('/content/drive')
        OCR_PATH = '/content/drive/MyDrive/OCR'
    else:
        OCR_PATH = '.'

Mounted at /content/drive


In [4]:
MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"
OUTPUT_DIR = "./got_ocr_finetuned"

MAX_LENGTH = 2048

BATCH_SIZE = 4
GRAD_ACCUMULATION = 1
LEARNING_RATE = 1e-4

In [5]:
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True)

tokenizer = processor.tokenizer

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

image_token_id = model.config.image_token_index
num_image_tokens = model.config.image_seq_length

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [6]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

trainable params: 3,784,704 || all params: 564,313,344 || trainable%: 0.6707


In [7]:
class GOTOCRDataset(Dataset):
    def __init__(self, data_list, processor, max_length=2048):
        self.data = data_list
        self.processor = processor
        self.tokenizer = processor.tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = os.path.join(OCR_PATH, "Train", item["image_path"])
        gt = item["ground_truth"]

        image = Image.open(image_path).convert("RGB")

        base = self.processor(image, return_tensors="pt")
        base_input_ids = base["input_ids"][0]      # includes *real* image tokens (id = config.image_token_index)
        pixel_values = base["pixel_values"][0]

        answer_text = gt + self.tokenizer.eos_token

        answer_ids = self.tokenizer(
            answer_text,
            return_tensors="pt",
            add_special_tokens=False,
        ).input_ids[0]

        full_input_ids = torch.cat([base_input_ids, answer_ids], dim=0)
        attention_mask = torch.ones_like(full_input_ids)

        assert base_input_ids.shape[0] < self.max_length, "max_length too small; would cut image tokens"
        if full_input_ids.shape[0] > self.max_length:
            full_input_ids = full_input_ids[: self.max_length]
            attention_mask = attention_mask[: self.max_length]

        labels = full_input_ids.clone()
        labels[: base_input_ids.shape[0]] = -100

        return {
            "input_ids": full_input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "labels": labels,
        }

In [8]:
train_data = [
    {'image_path': 'OCR_Train_1.png', 'ground_truth': 'Abog. Jorge Bresanovich Musa\nActuario Judicial'},
    {'image_path': 'OCR_Train_2.png', 'ground_truth': 'Dr. Miguel A. Rodas Ruiz Díaz\nMiembro del Tribunal de Apelación\nCivil y Comercial - 4ta. Sala'},
    {'image_path': 'OCR_Train_3.png', 'ground_truth': 'Firmado digitalmente por: NIDIA\nLETIZIA PAREDES ARIAS (JUEZ/A)'},
    {'image_path': 'OCR_Train_4.png', 'ground_truth': 'BIRIANA BENÍTEZ FARIA\nMiembro\nTribunal de Apelación 2a. Sala Penal'},
    {'image_path': 'OCR_Train_5.png', 'ground_truth': 'Dr. Eugenio Jiménez R\nMinistro'},
    {'image_path': 'OCR_Train_6.png', 'ground_truth': 'Dra. Miryam Peña Candia\nMinistra'},
    {'image_path': 'OCR_Train_7.png', 'ground_truth': 'Abg. Piedad Ozuna Wood Secretaria\nJudicial - C.S.J.'},
    {'image_path': 'OCR_Train_8.png', 'ground_truth': 'Luis María Benítez Riera\nMinistro'},
    {'image_path': 'OCR_Train_9.png', 'ground_truth': 'ELIO RUBEN OVELAR FRUTOS\nJUEZ PENAL DE SENTENCIA'},
    {'image_path': 'OCR_Train_10.png', 'ground_truth': 'ENRIQUE MONGELÓS AQUINO\nMiembro del Tribunal de Apelación\nCivil y Comercial - 4ta. Sala'},
    {'image_path': 'OCR_Train_11.png', 'ground_truth': 'DR. JOSE WALDR SERVÍN\nMiembro del Tribunal Apelación\nPenal, 3ª Sala. Capital'},
    {'image_path': 'OCR_Train_12.png', 'ground_truth': 'Luis María Benítez Riera\nMinistro'},
    {'image_path': 'OCR_Train_13.png', 'ground_truth': 'Abg. Gabriela Mora\nActuaria Judicial'},
    {'image_path': 'OCR_Train_14.png', 'ground_truth': 'Abog. Carol Fernández Cáceres\nActuaria Judicial'},
    {'image_path': 'OCR_Train_15.png', 'ground_truth': 'Jorge L. Barreto Alfonso\nActuario Judicial'},
    {'image_path': 'OCR_Train_16.png', 'ground_truth': 'Dr. Edward Vittone\nMiembro\nTribunal de Cuentas'},
    {'image_path': 'OCR_Train_17.png', 'ground_truth': 'Dr. Linneo Ynsfrán Saldívar\nMiembro 5ta. Sala'},
    {'image_path': 'OCR_Train_18.png', 'ground_truth': 'Dr. Manuel Dejesús Ramírez Candia\nMINISTRO'},
    {'image_path': 'OCR_Train_19.png', 'ground_truth': 'Abog. Karina Penoni\nSecretaria'},
    {'image_path': 'OCR_Train_20.png', 'ground_truth': 'Dr. MANUEL AGUIRRE RODAS\nJUEZ PENAL'},
    {'image_path': 'OCR_Train_21.png', 'ground_truth': 'Abg. Lourdes Nathalia Martínez\nActuaria Judicial'},
    {'image_path': 'OCR_Train_22.png', 'ground_truth': 'Arsenio Coronel Benítez\nPresidente\nTribunal de Cuentas'},
    {'image_path': 'OCR_Train_23.png', 'ground_truth': 'DR. RODRIGO A. ESCOBAR E.\nMiembro del Tribunal de Cuentas\nPrimera Sala'},
    {'image_path': 'OCR_Train_24.png', 'ground_truth': 'TERCERA SALA\nTRIBUNAL DE APELACION PENAL'},
    {'image_path': 'OCR_Train_25.png', 'ground_truth': 'Abg. Ma. Estela Bóveda Curril\nActuaria Judicial'},
    {'image_path': 'OCR_Train_26.png', 'ground_truth': 'Abog. Benita Duarte Olmedo\nActuaria Judicial'},
    {'image_path': 'OCR_Train_27.png', 'ground_truth': 'WILFRIDO PERALTA\nJUEZ PENAL'},
    {'image_path': 'OCR_Train_28.png', 'ground_truth': 'Dr. GIUSEPPE FOSSATI LÓPEZ\nMiembro del Tribunal de Apelación\nCivil y Comercial de la Capital\nCuarta Sala'},
    {'image_path': 'OCR_Train_29.png', 'ground_truth': 'María Luz Martínez Vázquez\nJuez'},
    {'image_path': 'OCR_Train_30.png', 'ground_truth': 'César Antonio Garay'},
    {'image_path': 'OCR_Train_31.png', 'ground_truth': 'Prof. Dra. Ma. Carolina Llanes O.\nMinistra'},
    {'image_path': 'OCR_Train_32.png', 'ground_truth': 'Dr. GIUSEPPE FOSSATI LÓPEZ\nMiembro del Tribunal de Apelación\nCivil y Comercial de la Capital\nCuarta Sala'},
    {'image_path': 'OCR_Train_33.png', 'ground_truth': 'Abg. LOURDES E. PEÑA\nJueza Penal\nLiq. y Sent. Nº 1'},
    {'image_path': 'OCR_Train_34.png', 'ground_truth': 'Abg. Wilfrido Méndez\nActuario Judicial'},
    {'image_path': 'OCR_Train_35.png', 'ground_truth': 'Abg. VICTOR HUGO ALFIERI OURIA\nJuez Penal'},
    {'image_path': 'OCR_Train_36.png', 'ground_truth': 'Abg. Lourdes Nathalia Martinez\nActuaria Judicial'},
    {'image_path': 'OCR_Train_37.png', 'ground_truth': 'Abog. Mercedes Sosa G.\nActuaria Judicial'},
    {'image_path': 'OCR_Train_38.png', 'ground_truth': 'Abog. Darío Báez Ferreira\nJuez Penal'},
    {'image_path': 'OCR_Train_39.png', 'ground_truth': 'VICTOR MANUEL MEDINA S.\nJUEZ'},
    {'image_path': 'OCR_Train_40.png', 'ground_truth': 'Abg. Adriana Sánchez Schlunk\nActuaria Judicial'},
]

In [9]:
dataset = GOTOCRDataset(
    train_data,
    processor,
    max_length=MAX_LENGTH,
)

In [10]:
sample = dataset[0]
print("Decoded input_ids (first 200 chars):")
print(processor.tokenizer.decode(sample["input_ids"][:200]))

print("\nDecoded labels (ignoring -100):")
label_ids = sample["labels"][sample["labels"] != -100]
print(processor.tokenizer.decode(label_ids))

Decoded input_ids (first 200 chars):
<|im_start|>system
You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user
<img><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad><imgpad>

In [11]:
def collate_fn(batch):
    pad_id = processor.tokenizer.pad_token_id or processor.tokenizer.eos_token_id

    input_ids = torch.nn.utils.rnn.pad_sequence(
        [b["input_ids"] for b in batch],
        batch_first=True,
        padding_value=pad_id,
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        [b["labels"] for b in batch],
        batch_first=True,
        padding_value=-100,
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [b["attention_mask"] for b in batch],
        batch_first=True,
        padding_value=0,
    )
    pixel_values = torch.stack([b["pixel_values"] for b in batch])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": labels,
    }

In [12]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=50,
    num_train_epochs=5,
    bf16=True,
    fp16=False,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

In [13]:
print("Starting training...")
trainer.train()

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.4586
20,1.7164
30,1.4099
40,1.2265
50,1.1746


TrainOutput(global_step=50, training_loss=1.5972113227844238, metrics={'train_runtime': 86.9821, 'train_samples_per_second': 2.299, 'train_steps_per_second': 0.575, 'total_flos': 153646663311360.0, 'train_loss': 1.5972113227844238, 'epoch': 5.0})

In [69]:
print("Saving adapter and processor...")
model.save_pretrained(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)

Saving adapter and processor...


[]

# Inference

In [24]:
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
from PIL import Image
import cv2
from accelerate import Accelerator

MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"
device = Accelerator().device

# 1) Base model (no LoRA)
base_model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.float16
).eval()

processor = AutoProcessor.from_pretrained(MODEL_ID)

test_img = "Test1.png"
image = cv2.imread(test_img)
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

base_inputs = processor(pil_image, return_tensors="pt").to(device)

with torch.no_grad():
    base_ids = base_model.generate(
        **base_inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=512,
    )

base_text = processor.decode(
    base_ids[0, base_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("BASE:\n" + base_text)

BASE:
DR. JOSE WALD IR SERV IN Mi em b rode lT rib un alAp el aci on Penal, 3: Sala. Capital


In [25]:
model.eval()

ft_inputs = processor(pil_image, return_tensors="pt").to(device)

with torch.no_grad():
    ft_ids = model.generate(
        **ft_inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=512,
        # TODO: This is purely to prevent OCR: OCR: OCR: ... I may not even need it at this point
        no_repeat_ngram_size=4,
        repetition_penalty=1.1,
    )

ft_text = processor.decode(
    ft_ids[0, ft_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("FINETUNED:\n" + ft_text)

FINETUNED:
DR. JOSÉ WAL.DIR SERVÍN
Miembro del Tribunal Apelación
Penal, 3ª Sala. Capital 



In [26]:
model.disable_adapter()
with torch.no_grad():
    ids_no_lora = model.generate(
        **ft_inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=512,
    )

txt_no_lora = processor.decode(
    ids_no_lora[0, ft_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("NO LORA:\n" + txt_no_lora)


NO LORA:
DR. JOSÉ WAL.DIR SERVÍN
Miembro del Tribunal Apelación
Penal, 3ª Sala. Capital

