In [3]:
import torch
import base64
import urllib.request

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

In [None]:
# Hub identifiers: the model weights and the processor are loaded from two
# different repositories.
model_name = "allenai/olmOCR-7B-0225-preview"
processor_name = "Qwen/Qwen2-VL-7B-Instruct"

# Load the weights as bfloat16 and put the module into evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
model = model.eval()

# use_fast=False is forwarded to the processor loader unchanged.
processor = AutoProcessor.from_pretrained(processor_name, use_fast=False)

# Run on the GPU when one is visible to PyTorch; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Release cached CUDA allocator memory on both sides of the transfer.
torch.cuda.empty_cache()
model.to(device)
torch.cuda.empty_cache()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Source PDF and the page to process. The page number is bound once so the
# rendered image and the anchor text below always refer to the same page.
src = "./source/SASA.pdf"
page_num = 1

# Render that page to an image (base64 PNG, going by the helper's name);
# target_longest_image_dim=1024 is passed through to the renderer.
image_base64 = render_pdf_to_base64png(src, page_num, target_longest_image_dim=1024)

# Extract anchor text for the same page and wrap it into the model prompt.
anchor_text = get_anchor_text(src, page_num, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)

# print() rather than a bare expression so the newlines in the prompt render.
print(prompt)


Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally.
Do not hallucinate.
RAW_TEXT_START
Page dimensions: 841.9x595.3
[Image 666x544 to 730x581]
[42x453]______________________________________________________________________________________________________________________________________________________________________________________
[57x441]Item Vendor Article No Article No Color Size Delivery Quantity Unit Suggest Price Unit Cost Discount (%) Amount
[85x429]Article Description Article Status Item category HKD
[42x414]______________________________________________________________________________________________________________________________________________________________________________________
[42x399]SKT SKATER
[48x387]0799099 OTHERS
[57x375]00030 463358 00 110475003001 N/A 3PC 20.12.2024 82 PC 49.90 20.00 59.92 1,640.00

In [7]:
# Show the anchor text by itself (it was assigned in the cell that builds
# `prompt`), to inspect what was extracted without the prompt wrapper around it.
print(anchor_text)

Page dimensions: 841.9x595.3
[Image 666x544 to 730x581]
[42x453]______________________________________________________________________________________________________________________________________________________________________________________
[57x441]Item Vendor Article No Article No Color Size Delivery Quantity Unit Suggest Price Unit Cost Discount (%) Amount
[85x429]Article Description Article Status Item category HKD
[42x414]______________________________________________________________________________________________________________________________________________________________________________________
[42x399]SKT SKATER
[48x387]0799099 OTHERS
[57x375]00030 463358 00 110475003001 N/A 3PC 20.12.2024 82 PC 49.90 20.00 59.92 1,640.00
[85x363]KT TOOTHBRUSH(3-5 AGE), N/A, 3PC
[57x351]00040 618727 00 110475103001 N/A 3PC 20.12.2024 51 PC 49.90 28.20 43.49 1,438.20
[85x339]CHIKAWA TOOTHBRUSH(6-12 AGE), N/A, 3PC
[57x327]00050 619120 110502603001 N/A 3PC 20.12.2024 450 PC 49.90 28.20 4