OLMOCR:
1. Discards figures but keeps their captions.
2. Converts tables to markdown.
3. The output format is not really stable: sometimes it ignores the table caption, and sometimes the figure caption comes within square brackets, e.g.:
```
![Figure 5: Patches with different color drift.](image)
```

In [1]:
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load the olmOCR checkpoint in bfloat16 and put it in eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model = model.eval()

# The processor is loaded from the Qwen2-VL-7B-Instruct repository,
# not from the olmOCR checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model to `device` and echoes its repr.
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.49it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

In [2]:
import pymupdf
from PIL import Image
# Render page 7 of the PDF (0-based index 6) to an RGB image and shrink it so
# that its longest side is 1024 px.
# NOTE: the prompt cell passes page number 7 of the same PDF to get_anchor_text;
# keep the two in sync when switching pages (presumably that API is 1-based —
# confirm against the olmocr documentation).
pdf_path = "./2307.00421.pdf"
page_index = 6  # 0-based index into the document, i.e. its 7th page

# Open the document in a context manager so the file handle is released once
# the page has been rasterised.
with pymupdf.open(pdf_path) as doc:
    pixmap = doc[page_index].get_pixmap(dpi=800)
    mode = 'RGB'
    # Build the PIL image while the document is still open.
    img = Image.frombytes(mode, (pixmap.width, pixmap.height), pixmap.samples)

# Scale so the longer side equals target_longest_image_dim while preserving
# the aspect ratio (the shorter side is truncated to an int).
width, height = img.size
target_longest_image_dim = 1024
if width > height:
    new_width = target_longest_image_dim
    new_height = int(height * target_longest_image_dim / width)
else:
    new_height = target_longest_image_dim
    new_width = int(width * target_longest_image_dim / height)
img = img.resize((new_width, new_height), Image.LANCZOS)

In [3]:
#image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1024)
import base64
from io import BytesIO

# Serialise the rendered page image to PNG and base64-encode it for the data URL.
buffered = BytesIO()
img.save(buffered, format="png")
# b64encode returns bytes; decode to str so the f-string below does not embed a
# bytes repr ("b'...'") inside the data URL.
image_base64 = base64.b64encode(buffered.getvalue()).decode("ascii")

# Build the prompt, using document metadata.
# Page 7 here must be the same page that was rendered into `img`
# (index 6 of the same PDF in the rendering cell).
anchor_text = get_anchor_text("./2307.00421.pdf", 7, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)

# Build the full prompt: one user turn holding the text prompt and the page image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ],
    }
]

# Apply the chat template and processor
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The PIL image itself is handed to the processor alongside the templated text.
inputs = processor(
    text=[text],
    images=[img],
    padding=True,
    return_tensors="pt",
)
# Move every input tensor to the same device as the model.
inputs = {key: value.to(device) for (key, value) in inputs.items()}

# Generate the output. The sampling options are left commented out, so only
# max_new_tokens is overridden here.
output = model.generate(
    **inputs,
    # temperature=1,
    max_new_tokens=4096,
    # num_return_sequences=1,
    # do_sample=True,
)

# Decode the output: keep only the tokens that follow the prompt.
prompt_length = inputs["input_ids"].shape[1]
new_tokens = output[:, prompt_length:]
text_output = processor.tokenizer.batch_decode(
    new_tokens, skip_special_tokens=True
)

# Display the decoded text of the single sequence in the batch.
text_output[0]

'{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"We notice that even with 10% random color drift, the attack patches can still maintain an attack rate as reliable as that for the original clean patches. This performance suggests that the patch is relatively robust and can withstand slight color variations during printing. Figure 5 demonstrates a set of attack patches with different noise levels. As the level of noise increases, the patch experiences certain color deviations. However, adjusting the color of the patch is distinct from this process because it uniformly alters all the pixels to maintain the local texture while achieving the desired color.\\n\\n![Figure 5: Patches with different color drift.](image)\\n\\n### 4.3.3 Patch scaling\\n\\nIn practical deployments, it is often the case that the environment can accommodate larger sized attack patches. As demonstrated in the previous sections, training lar