# Training Code to be used

In [30]:
pip install verovio



In [31]:
pip install evaluate



In [32]:
pip install jiwer



In [33]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor, Trainer, TrainingArguments, TrainerCallback
from peft import LoraConfig, get_peft_model, TaskType
import cv2
import evaluate
import json
import numpy as np
from jiwer import cer as cer_jiwer

In [34]:
if __name__ == '__main__':
    def in_colab():
        try:
            import google.colab
            return True
        except ImportError:
            return False

    if in_colab():
        from google.colab import drive
        drive.mount('/content/drive')
        OCR_PATH = '/content/drive/MyDrive/OCR'
    else:
        OCR_PATH = '.'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"
OUTPUT_DIR = "./got_ocr_finetuned"

MAX_LENGTH = 2048

BATCH_SIZE = 4
GRAD_ACCUMULATION = 1
LEARNING_RATE = 15 * 1e-5

In [36]:
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True)

tokenizer = processor.tokenizer

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

image_token_id = model.config.image_token_index
num_image_tokens = model.config.image_seq_length

In [37]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)
model.enable_input_require_grads()
model.print_trainable_parameters()
model.gradient_checkpointing_enable()

trainable params: 1,572,864 || all params: 562,101,504 || trainable%: 0.2798


In [38]:
class GOTOCRDataset(Dataset):
    def __init__(self, data_list, processor, max_length=2048):
        self.data = data_list
        self.processor = processor
        self.tokenizer = processor.tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = os.path.join(OCR_PATH, "Samples", item["image_path"])
        gt = item["ground_truth"]

        image = Image.open(image_path).convert("RGB")

        base = self.processor(image, return_tensors="pt")
        base_input_ids = base["input_ids"][0]
        pixel_values = base["pixel_values"][0]

        answer_text = gt + self.tokenizer.eos_token

        answer_ids = self.tokenizer(
            answer_text,
            return_tensors="pt",
            add_special_tokens=False,
        ).input_ids[0]

        full_input_ids = torch.cat([base_input_ids, answer_ids], dim=0)
        attention_mask = torch.ones_like(full_input_ids)

        assert base_input_ids.shape[0] < self.max_length, "max_length too small; would cut image tokens"
        if full_input_ids.shape[0] > self.max_length:
            full_input_ids = full_input_ids[: self.max_length]
            attention_mask = attention_mask[: self.max_length]

        labels = full_input_ids.clone()
        labels[: base_input_ids.shape[0]] = -100

        return {
            "input_ids": full_input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "labels": labels,
        }

In [39]:
train_data = [
    {'image_path': 'OCR_Train_1.png', 'ground_truth': 'Abog. Jorge Bresanovich Musa\nActuario Judicial'},
    {'image_path': 'OCR_Train_2.png', 'ground_truth': 'Dr. Miguel A. Rodas Ruiz Díaz\nMiembro del Tribunal de Apelación\nCivil y Comercial - 4ta. Sala'},
    {'image_path': 'OCR_Train_3.png', 'ground_truth': 'Firmado digitalmente por: NIDIA\nLETIZIA PAREDES ARIAS (JUEZ/A)'},
    {'image_path': 'OCR_Train_4.png', 'ground_truth': 'BIRIANA BENÍTEZ FARIA\nMiembro\nTribunal de Apelación 2a. Sala Penal'},
    {'image_path': 'OCR_Train_5.png', 'ground_truth': 'Dr. Eugenio Jiménez R\nMinistro'},
    {'image_path': 'OCR_Train_6.png', 'ground_truth': 'Dra. Miryam Peña Candia\nMinistra'},
    {'image_path': 'OCR_Train_7.png', 'ground_truth': 'Abg. Piedad Ozuna Wood Secretaria\nJudicial - C.S.J.'},
    {'image_path': 'OCR_Train_8.png', 'ground_truth': 'Luis María Benítez Riera\nMinistro'},
    {'image_path': 'OCR_Train_9.png', 'ground_truth': 'ELIO RUBEN OVELAR FRUTOS\nJUEZ PENAL DE SENTENCIA'},
    {'image_path': 'OCR_Train_10.png', 'ground_truth': 'ENRIQUE MONGELÓS AQUINO\nMiembro del Tribunal de Apelación\nCivil y Comercial - 4ta. Sala'},
    {'image_path': 'OCR_Train_11.png', 'ground_truth': 'DR. JOSE WALDR SERVÍN\nMiembro del Tribunal Apelación\nPenal, 3ª Sala. Capital'},
    {'image_path': 'OCR_Train_12.png', 'ground_truth': 'Luis María Benítez Riera\nMinistro'},
    {'image_path': 'OCR_Train_13.png', 'ground_truth': 'Abg. Gabriela Mora\nActuaria Judicial'},
    {'image_path': 'OCR_Train_14.png', 'ground_truth': 'Abog. Carol Fernández Cáceres\nActuaria Judicial'},
    {'image_path': 'OCR_Train_15.png', 'ground_truth': 'Jorge L. Barreto Alfonso\nActuario Judicial'},
    {'image_path': 'OCR_Train_16.png', 'ground_truth': 'Dr. Edward Vittone\nMiembro\nTribunal de Cuentas'},
    {'image_path': 'OCR_Train_17.png', 'ground_truth': 'Dr. Linneo Ynsfrán Saldívar\nMiembro 5ta. Sala'},
    {'image_path': 'OCR_Train_18.png', 'ground_truth': 'Dr. Manuel Dejesús Ramírez Candia\nMINISTRO'},
    {'image_path': 'OCR_Train_19.png', 'ground_truth': 'Abog. Karina Penoni\nSecretaria'},
    {'image_path': 'OCR_Train_20.png', 'ground_truth': 'Dr. MANUEL AGUIRRE RODAS\nJUEZ PENAL'},
    {'image_path': 'OCR_Train_21.png', 'ground_truth': 'Abg. Lourdes Nathalia Martínez\nActuaria Judicial'},
    {'image_path': 'OCR_Train_22.png', 'ground_truth': 'Arsenio Coronel Benítez\nPresidente\nTribunal de Cuentas'},
    {'image_path': 'OCR_Train_23.png', 'ground_truth': 'DR. RODRIGO A. ESCOBAR E.\nMiembro del Tribunal de Cuentas\nPrimera Sala'},
    {'image_path': 'OCR_Train_24.png', 'ground_truth': 'TERCERA SALA\nTRIBUNAL DE APELACION PENAL'},
    {'image_path': 'OCR_Train_25.png', 'ground_truth': 'Abg. Ma. Estela Bóveda Curril\nActuaria Judicial'},
    {'image_path': 'OCR_Train_26.png', 'ground_truth': 'Abog. Benita Duarte Olmedo\nActuaria Judicial'},
    {'image_path': 'OCR_Train_27.png', 'ground_truth': 'WILFRIDO PERALTA\nJUEZ PENAL'},
    {'image_path': 'OCR_Train_28.png', 'ground_truth': 'Dr. GIUSEPPE FOSSATI LÓPEZ\nMiembro del Tribunal de Apelación\nCivil y Comercial de la Capital\nCuarta Sala'},
    {'image_path': 'OCR_Train_29.png', 'ground_truth': 'María Luz Martínez Vázquez\nJuez'},
    {'image_path': 'OCR_Train_30.png', 'ground_truth': 'César Antonio Garay'},
    {'image_path': 'OCR_Train_31.png', 'ground_truth': 'Prof. Dra. Ma. Carolina Llanes O.\nMinistra'},
    {'image_path': 'OCR_Train_32.png', 'ground_truth': 'Dr. GIUSEPPE FOSSATI LÓPEZ\nMiembro del Tribunal de Apelación\nCivil y Comercial de la Capital\nCuarta Sala'},
    {'image_path': 'OCR_Train_33.png', 'ground_truth': 'Abg. LOURDES E. PEÑA\nJueza Penal\nLiq. y Sent. Nº 1'},
    {'image_path': 'OCR_Train_34.png', 'ground_truth': 'Abg. Wilfrido Méndez\nActuario Judicial'},
    {'image_path': 'OCR_Train_35.png', 'ground_truth': 'Abg. VICTOR HUGO ALFIERI OURIA\nJuez Penal'},
    {'image_path': 'OCR_Train_36.png', 'ground_truth': 'Abg. Lourdes Nathalia Martinez\nActuaria Judicial'},
    {'image_path': 'OCR_Train_37.png', 'ground_truth': 'Abog. Mercedes Sosa G.\nActuaria Judicial'},
    {'image_path': 'OCR_Train_38.png', 'ground_truth': 'Abog. Darío Báez Ferreira\nJuez Penal'},
    {'image_path': 'OCR_Train_39.png', 'ground_truth': 'VICTOR MANUEL MEDINA S.\nJUEZ'},
    {'image_path': 'OCR_Train_40.png', 'ground_truth': 'Abg. Adriana Sánchez Schlunk\nActuaria Judicial'},
    {'image_path': 'OCR_Train_41.png', 'ground_truth': 'PODER JUDICIAL\nJUZGADO PENAL\nDE SENTENCIA\nNº 22\nAsunción - Paraguay'},
    {'image_path': 'OCR_Train_42.png', 'ground_truth': 'Poder Judicial\nCámara de Apelación\nEn lo Civil y Comercial - 1ª Sala'},
    {'image_path': 'OCR_Train_43.png', 'ground_truth': 'Poder Judicial\nSección Estadística Penal\nCapital'},
    {'image_path': 'OCR_Train_44.png', 'ground_truth': 'GERALDINE CASBS M.\nMiembro Trib. Apelación Laboral'},
    {'image_path': 'OCR_Train_45.png', 'ground_truth': 'Abg. Nancy Aquino\nActuaria Judicial'},
    {'image_path': 'OCR_Train_46.png', 'ground_truth': 'PODER JUDICIAL\nREPÚBLICA DEL PARAGUAY\n2ª SALA\nTRIBUNAL DE CUENTAS'},
    {'image_path': 'OCR_Train_47.png', 'ground_truth': 'Abog. SILVIA SANABRIA\nActuaria Judicial\nCoordinación de Juicios Orales'},
    {'image_path': 'OCR_Train_48.png', 'ground_truth': 'Mónica Ramona Reguera\nActuaria Judicial\nTribunal de Apelación Civil y\nComercial Cuarta Sala'},
    {'image_path': 'OCR_Train_49.png', 'ground_truth': 'Dr. Víctor Ríos Ojeda\nMinistro'},
    {'image_path': 'OCR_Train_50.png', 'ground_truth': 'Dr. A. Martin Avalos Valdez\nMiembro-T Contencioso-Administrativo\n1ra Sala'},
]

val_data = [
    {'image_path': 'Test1.png', 'ground_truth': 'Dr. Mario Y. Maldana Griffith\nMiembro'},
    {'image_path': 'Test2.png', 'ground_truth': 'Abog. Carol Fernández Cáceres\nActuaria Judicial'},
    {'image_path': 'Test3.png', 'ground_truth': 'Dr. AGUSTIN LOVERA CAÑETE\nMIEMBRO'},
    {'image_path': 'Test4.png', 'ground_truth': 'og. SANDRA FARÍAS de FERNÁNDEZ\nJuez Penal de Sentencia\nNo. 3'},
    {'image_path': 'Test5.png', 'ground_truth': 'Abg. Wilfrido Méndez\nActuario Judicial'},
    {'image_path': 'Test6.png', 'ground_truth': 'ALMA MENDEZ DE BUONGERMINI\nMiembro Trib. Apel. Laboral'},
    {'image_path': 'Test7.png', 'ground_truth': 'JULIO CÉSAR CENTENO B.\nMiembro'},
    {'image_path': 'Test8.png', 'ground_truth': 'Eulalia Villalba Ojeda\nAsistente\nEstadística Penal - Asuncion'},
    {'image_path': 'Test9.png', 'ground_truth': 'ABG. LINA CASCO D.\nActuaria Judicial'},
    {'image_path': 'Test10.png', 'ground_truth': 'YOLANDA PORTILLO\nJuez Penal'},
]


In [40]:
# def load_data_from_jsonl(file_path):
#     data = []
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             if line.strip():  # skip empty lines
#                 data.append(json.loads(line))
#     return data

# train_data = load_data_from_jsonl(os.path.join(OCR_PATH, "train.jsonl"))
# val_data = load_data_from_jsonl(os.path.join(OCR_PATH,"val.jsonl"))

In [41]:
dataset = GOTOCRDataset(
    train_data,
    processor,
    max_length=MAX_LENGTH,
)

In [42]:
val_dataset = GOTOCRDataset(
    val_data,
    processor,
    max_length=MAX_LENGTH,
)

In [43]:
def collate_fn(batch):
    pad_id = processor.tokenizer.pad_token_id or processor.tokenizer.eos_token_id

    input_ids = torch.nn.utils.rnn.pad_sequence(
        [b["input_ids"] for b in batch],
        batch_first=True,
        padding_value=pad_id,
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        [b["labels"] for b in batch],
        batch_first=True,
        padding_value=-100,
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [b["attention_mask"] for b in batch],
        batch_first=True,
        padding_value=0,
    )
    pixel_values = torch.stack([b["pixel_values"] for b in batch])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": labels,
    }

In [44]:
cer_metric = evaluate.load("cer")

def _normalize_text(s: str) -> str:
    if s is None:
        return ""
    s = s.replace("\r", "\n")
    lines = [ln.strip() for ln in s.splitlines()]
    s = "\n".join(lines)
    s = " ".join(s.split())
    return s

def evaluate_ocr_dataset(
    model,
    processor,
    examples,
    max_new_tokens: int = 40,
    num_beams: int = 1,
    print_samples: bool = False,
):
    model.eval()
    device = next(model.parameters()).device

    all_preds = []
    all_refs = []

    for ex in examples:
        img_path = os.path.join(OCR_PATH, "Samples", ex["image_path"])
        ref_text = ex["ground_truth"]

        bgr = cv2.imread(img_path)
        if bgr is None:
            print(f"Could not read image: {img_path}")
            continue
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb)

        inputs = processor(pil_image, return_tensors="pt").to(device)

        with torch.no_grad():
            gen_ids = model.generate(
                **inputs,
                do_sample=False,
                num_beams=num_beams,
                length_penalty=1,
                tokenizer=processor.tokenizer,
                stop_strings="<|im_end|>",
                max_new_tokens=max_new_tokens,
                no_repeat_ngram_size=4,
                repetition_penalty=1.1,
            )

        prompt_len = inputs["input_ids"].shape[1]
        gen_seq = gen_ids[0, prompt_len:]

        pred_text = processor.decode(
            gen_seq,
            skip_special_tokens=True,
        )

        ref_norm = _normalize_text(ref_text)
        pred_norm = _normalize_text(pred_text)

        all_refs.append(ref_norm)
        all_preds.append(pred_norm)

        sample_cer = cer_jiwer(ref_norm, pred_norm)

        if print_samples:
            print("=" * 80)
            print(f"IMAGE  : {img_path}")
            print(f"REF    : {ref_norm}")
            print(f"PRED   : {pred_norm}")
            print(f"CER    : {sample_cer:.4f}")

    global_cer = cer_metric.compute(predictions=all_preds, references=all_refs)
    exact = sum(p == r for p, r in zip(all_preds, all_refs)) / len(all_refs)

    print("\n" + "#" * 80)
    print(f"GLOBAL CER      : {global_cer:.4f}")
    print(f"EXACT MATCH RATE: {exact * 100:.2f}%")
    print("#" * 80)

    return {"cer": global_cer, "exact_match": exact}

In [45]:
class CEREvaluationCallback(TrainerCallback):
    def __init__(self, model, processor, eval_dataset, log_steps=False):
        self.model = model
        self.processor = processor
        self.eval_dataset = eval_dataset
        self.log_steps = log_steps

    def on_epoch_end(self, args, state, control, **kwargs):
        """
        Run evaluation at the end of every epoch.
        """
        print(f"\n[Epoch {state.epoch:.0f}] Running CER Evaluation...")

        metrics = evaluate_ocr_dataset(
            model=self.model,
            processor=self.processor,
            examples=self.eval_dataset,
            max_new_tokens=30,
            num_beams=3,
            print_samples=False
        )

        self.model.train()

# Training
Cer measured in Callback. Disable this if you are not preticularily concerned with the metric over epochs: result with ~0.10 cer is more consistent without it and it's faster.

In [46]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=50,
    num_train_epochs=10,
    bf16=True,
    fp16=False,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",
)

cer_callback = CEREvaluationCallback(
    model=model,
    processor=processor,
    eval_dataset=val_data
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
    callbacks=[cer_callback],
)

# Loop is much faster without cer_callback.

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     data_collator=collate_fn,
# )

In [47]:
print("Starting training...")
trainer.train()

Starting training...




Step,Training Loss
10,2.9825
20,2.2095
30,2.0767
40,1.6924
50,1.6865
60,1.2969
70,1.5056
80,1.2806
90,1.3533
100,1.2156



[Epoch 1] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.2979
EXACT MATCH RATE: 10.00%
################################################################################

[Epoch 2] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.2033
EXACT MATCH RATE: 0.00%
################################################################################

[Epoch 3] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.2009
EXACT MATCH RATE: 10.00%
################################################################################





[Epoch 4] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.2530
EXACT MATCH RATE: 10.00%
################################################################################

[Epoch 5] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.1655
EXACT MATCH RATE: 10.00%
################################################################################

[Epoch 6] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.1726
EXACT MATCH RATE: 0.00%
################################################################################

[Epoch 7] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.1087
EXACT MATCH RATE: 0.00%
################################################################################





[Epoch 8] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.0946
EXACT MATCH RATE: 0.00%
################################################################################

[Epoch 9] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.0969
EXACT MATCH RATE: 0.00%
################################################################################

[Epoch 10] Running CER Evaluation...

################################################################################
GLOBAL CER      : 0.1040
EXACT MATCH RATE: 0.00%
################################################################################


TrainOutput(global_step=130, training_loss=1.5914938779977652, metrics={'train_runtime': 410.9062, 'train_samples_per_second': 1.217, 'train_steps_per_second': 0.316, 'total_flos': 381940830167040.0, 'train_loss': 1.5914938779977652, 'epoch': 10.0})

In [48]:
results = evaluate_ocr_dataset(
    model,
    processor,
    val_data,
    max_new_tokens=30,
    num_beams=3,
    print_samples=True,
)

print(results)

IMAGE  : /content/drive/MyDrive/OCR/Samples/Test1.png
REF    : Dr. Mario Y. Maldana Griffith Miembro
PRED   : Dr. María Y. Maldana Griffith Metro Metro
CER    : 0.2703
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test2.png
REF    : Abog. Carol Fernández Cáceres Actuaria Judicial
PRED   : Abog. Carol Fernández Cáceres Actuaria Judicial 21
CER    : 0.0638
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test3.png
REF    : Dr. AGUSTIN LOVERA CAÑETE MIEMBRO
PRED   : DT. AGUSTIN LOVERA CANÉTE MIEMBRO
CER    : 0.0909
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test4.png
REF    : og. SANDRA FARÍAS de FERNÁNDEZ Juez Penal de Sentencia No. 3
PRED   : ÓG. SÁNDRA FARIAS de FERNAN EZ Pena! de Setencia 1992 1
CER    : 0.3000
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test5.png
REF    : Abg. Wilfrido Méndez Actuario Judicial
PRED   : Abg. Wilfrido Méndez Actuário Judicia
CER    : 0.0526
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test6.png
REF    : ALMA MENDEZ DE BUONGERMINI Miembro Trib. Apel. Laboral
P

In [29]:
print("Saving adapter and processor...")
model.save_pretrained(os.path.join(OCR_PATH, "fine_tuned_model"))
processor.save_pretrained(os.path.join(OCR_PATH, "fine_tuned_model"))

Saving adapter and processor...


[]

# Inference and Part of Evaluating

In [None]:
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
from PIL import Image
import cv2
from accelerate import Accelerator

MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"
device = Accelerator().device

# 1) Base model (no LoRA)
base_model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.float16
).eval()

processor = AutoProcessor.from_pretrained(MODEL_ID)

test_img = "Test4.png"
image = cv2.imread(os.path.join(OCR_PATH, "Samples", test_img))
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

base_inputs = processor(pil_image, return_tensors="pt").to(device)

with torch.no_grad():
    base_ids = base_model.generate(
        **base_inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=64,
    )

base_text = processor.decode(
    base_ids[0, base_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("BASE:\n" + base_text)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


BASE:
FERNAN


In [None]:
results = evaluate_ocr_dataset(
    model,
    processor,
    val_data,
    max_new_tokens=30,
    num_beams=3,
    print_samples=True,
)

IMAGE  : /content/drive/MyDrive/OCR/Samples/Test1.png
REF    : Dr. Mario Y. Maldana Griffith Miembro
PRED   : Dr.MarioyMadanaGriffith
CER    : 0.4054
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test2.png
REF    : Abog. Carol Fernández Cáceres Actuaria Judicial
PRED   : A bog. Carol Ferna g lez Caceres Act u arial udi cia l
CER    : 0.2766
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test3.png
REF    : Dr. AGUSTIN LOVERA CAÑETE MIEMBRO
PRED   : DI. AGUST IN LOVER A CANE TE MIEM BRO
CER    : 0.1818
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test4.png
REF    : og. SANDRA FARÍAS de FERNÁNDEZ Juez Penal de Sentencia No. 3
PRED   : FERNAN eZ og. S AND RAF ARIAS de en cia
CER    : 0.7000
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test5.png
REF    : Abg. Wilfrido Méndez Actuario Judicial
PRED   : Abg. Wil fido Mer lez Act u a
CER    : 0.4474
IMAGE  : /content/drive/MyDrive/OCR/Samples/Test6.png
REF    : ALMA MENDEZ DE BUONGERMINI Miembro Trib. Apel. Laboral
PRED   : ALMA MEND EL DE SUO NGER M

In [None]:
model.eval()

ft_inputs = processor(pil_image, return_tensors="pt").to(device)

with torch.no_grad():
    ft_ids = model.generate(
        **ft_inputs,
        do_sample=False,
        num_beams=5,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=24,
        length_penalty=1,
        # TODO: This is purely to prevent OCR: OCR: OCR: ... I may not even need it at this point
        no_repeat_ngram_size=4,
        repetition_penalty=1.1,
    )

ft_text = processor.decode(
    ft_ids[0, ft_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("FINETUNED:\n" + ft_text)

FINETUNED:
Óg. SAVDRÁ FÁRIAS de FERNÁNREZ F. Pena! de


In [None]:
model.disable_adapter()
with torch.no_grad():
    ids_no_lora = model.generate(
        **ft_inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=64,
    )

txt_no_lora = processor.decode(
    ids_no_lora[0, ft_inputs["input_ids"].shape[1]:],
    skip_special_tokens=True
)
print("NO LORA:\n" + txt_no_lora)


NO LORA:
FERNAN
