In [2]:
if __name__ == '__main__':
    def in_colab():
        """Return True when the `google.colab` package is importable, else False."""
        try:
            # Imported purely as an availability probe; the module itself is unused here.
            import google.colab
        except ImportError:
            return False
        else:
            return True

    # OCR_PATH is the root directory the later cells join their file paths onto.
    if not in_colab():
        # Outside Colab: resolve everything relative to the working directory.
        OCR_PATH = '.'
    else:
        # Inside Colab: mount Google Drive and work from the OCR folder on it.
        from google.colab import drive
        drive.mount('/content/drive')
        OCR_PATH = '/content/drive/MyDrive/OCR'

Mounted at /content/drive


In [3]:
pip install --upgrade transformers



In [11]:
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
import os
import cv2
from PIL import Image
from accelerate import Accelerator

In [12]:
# Resolve the execution device once; the model below and the inputs in the
# inference cell are both placed on it.
device = Accelerator().device
model_DIR = "fine_tuned_model"
BASE_MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"

# Directory that the processor and the adapter are both loaded from.
adapter_path = os.path.join(OCR_PATH, model_DIR)

processor = AutoProcessor.from_pretrained(adapter_path)

# Load the base checkpoint onto the resolved device instead of a hard-coded
# "cuda", so the model and the inputs (moved with `.to(device)`) always agree
# and the cell does not fail on hosts without a CUDA device.
model = AutoModelForImageTextToText.from_pretrained(
    BASE_MODEL_ID,
    device_map=device,
    # Keep half precision on accelerators; fall back to float32 on CPU, where
    # float16 inference is poorly supported.
    torch_dtype=torch.float32 if device.type == "cpu" else torch.float16,
)

# Attach the fine-tuned adapter weights on top of the base model.
model.load_adapter(adapter_path)

# Switch to evaluation mode; as the last expression this also displays the module tree.
model.eval()

GotOcr2ForConditionalGeneration(
  (model): GotOcr2Model(
    (vision_tower): GotOcr2VisionEncoder(
      (patch_embed): GotOcr2PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (layers): ModuleList(
        (0-11): 12 x GotOcr2VisionLayer(
          (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attn): GotOcr2VisionAttention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): GotOcr2MLPBlock(
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (act): GELUActivation()
          )
        )
      )
      (neck): GotOcr2VisionNeck(
        (conv1): Conv2d(768, 256, kernel_size=(1, 1), stride=(1

In [16]:
# Run OCR on a single sample image and print the decoded text.
img_path = "OCR_Train_1.png"
full_img_path = os.path.join(OCR_PATH, "Samples", img_path)

# cv2.imread reports a missing/unreadable file by returning None instead of
# raising, which would otherwise surface as an opaque error inside cvtColor.
image = cv2.imread(full_img_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {full_img_path}")

# OpenCV loads pixels as BGR; convert to RGB before handing the image to PIL.
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

inputs = processor(pil_image, return_tensors="pt").to(device)

# Gradients are not needed for generation.
with torch.no_grad():
    ids = model.generate(
        **inputs,
        do_sample=False,            # no sampling: beam search only
        num_beams=3,
        length_penalty=1,
        tokenizer=processor.tokenizer,  # passed alongside stop_strings
        stop_strings="<|im_end|>",
        max_new_tokens=30,          # hard cap on the number of generated tokens
        no_repeat_ngram_size=4,
        repetition_penalty=1.1,
    )

# Skip the prompt positions so that only newly generated tokens are decoded.
prompt_len = inputs["input_ids"].shape[1]
text = processor.decode(ids[0, prompt_len:], skip_special_tokens=True)
print("Text:\n" + text)

Text:
Abog. Jorge Bresanovich Musa
Agotuario Judicial
