In [1]:
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
from transformers.image_utils import load_image

In [2]:
# Checkpoint identifier handed to every `from_pretrained` call in this notebook.
model_path = "nanonets/Nanonets-OCR-s"

# Load the image-text-to-text model. dtype and device placement are left to
# the library ("auto") rather than being pinned in the notebook.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    # Optional: uncomment to request FlashAttention-2 kernels (the keyword is
    # `attn_implementation`; presumably needs the flash-attn package installed).
    # attn_implementation="flash_attention_2",
)
# Switch to evaluation mode for inference. `eval()` returns the module, so as
# the last expression of the cell its architecture is echoed as the output.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
            (act_fn): SiLU()

In [4]:

# Tokenizer for the checkpoint. NOTE(review): not referenced by the cells
# visible here (decoding goes through `processor`); kept in case later cells
# rely on it.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Processor that builds the chat prompt, prepares the text + image inputs and
# decodes generated ids in the cells below.
processor = AutoProcessor.from_pretrained(model_path, use_fast=True)


In [5]:
# Relative path of the document image to run OCR on; it is handed to
# `load_image` in a later cell.
image_path = "../data/table.jpg"

In [14]:
# Instruction sent to the model alongside the page image. The wording is kept
# verbatim because it is part of the model input.
prompt_text = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes. For table use <table></table> tags."""

# Read the document image from `image_path`.
image = load_image(image_path)

# Chat-style conversation: a system turn followed by one user turn that holds
# an image slot and the instruction text. The image itself is supplied to the
# processor separately, so the slot carries no payload here.
system_turn = {"role": "system", "content": "You are a helpful assistant."}
user_turn = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": prompt_text},
    ],
}
messages = [system_turn, user_turn]

In [15]:
# Render the conversation into a single prompt string (not token ids), with
# the generation prompt appended so the model continues as the assistant.
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Tokenize the prompt and preprocess the image into PyTorch tensors.
inputs = processor(text=prompt, images=[image], padding=True, return_tensors="pt")
# Move the tensors to wherever the model was placed. The model is loaded with
# device_map="auto", so hard-coding "cuda" would fail on a machine without
# CUDA and could disagree with the model's actual device.
inputs = inputs.to(model.device)

In [16]:
# Decoding options: sampling is disabled and the number of newly generated
# tokens is capped at 15000.
generation_options = {"max_new_tokens": 15000, "do_sample": False}
generated_ids = model.generate(**inputs, **generation_options)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [17]:
# Size of the prompt along dimension 1 of `input_ids`; used in the next cell
# to slice the prompt off the generated ids.
prompt_length = inputs["input_ids"].shape[1]

In [18]:
# Drop the first `prompt_length` positions along dimension 1 so only the ids
# that follow the prompt remain.
trimmed_generated_ids = generated_ids[:, prompt_length:]

In [19]:
# Decode the generated ids to text without special tokens, then take the
# first (only requested) sequence and strip its leading whitespace.
decoded_batch = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=True)
output = decoded_batch[0].lstrip()

In [20]:
# Show the decoded OCR text as plain text.
print(output)

A. Crépet et al.
International Journal of Hygiene and Environmental Health 222 (2019) 291–306

**Table 1**
Description of consumption and concentration data for nine different European countries. n = number of individuals in the overall consumption survey. N = number of individuals included in this study (adults 18–64 years old, children 11–15 years old). N = number of substances in steatosis CAG after matching with contamination data; the number in brackets indicates the number of substances with measurements ≥ LOD. No national monitoring data was available for Spain (SP) and the United Kingdom (UK).

| Country | Consumption survey | Consumption data used for the study | National concentration survey |
|---|---|---|---|
|  | Method | Years | Name | Population | n total | Mean age | Weight mean | n | Years | N | Number of measurements | Measurements ≥ LOD in total measurements (%) |
|  |  |  |  | Adults (14-105 years) | 3214 | 40 (18-64) | 71.4 (69-133) | 1356 | 2011-2014 | 115 (39) | 