# Imports

In [1]:
import torch
import os
import gc
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
import fitz 
from tqdm import tqdm
# Show the installed PyTorch version string.
print(torch.__version__)


2.9.1+cu130


In [2]:
# Display the CUDA version string reported by torch.version.
torch.version.cuda

'13.0'

[]

In [3]:
# Check whether PyTorch reports CUDA as available in this environment.
torch.cuda.is_available()

True

# Image extraction

In [8]:
def transform_to_image(company_name, dpi=200):
    """
    Render every page of ``../data/{company_name}.pdf`` to a PNG file.

    Pages are written to ``../data/imgs_{company_name}/page_{n}.png`` with
    ``n`` starting at 1. The output directory is created if it does not
    exist. Progress is shown with a tqdm bar and a summary line is printed
    once all pages have been saved.

    Args:
        company_name: Base name (without extension) of the PDF under
            ``../data``; also used to name the output directory.
        dpi: Resolution passed to ``page.get_pixmap``. Defaults to 200.
    """
    pdf_path = f"../data/{company_name}.pdf"
    out_dir = f"../data/imgs_{company_name}"
    os.makedirs(out_dir, exist_ok=True)

    # Open the document in a context manager so it is closed even if
    # rendering or saving a page raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Capture the page count while the document is still open.
        page_count = len(doc)

        progress = tqdm(doc, desc=f"Processing {company_name}", unit="page")
        for page_no, page in enumerate(progress, start=1):
            pix = page.get_pixmap(dpi=dpi)
            pix.save(os.path.join(out_dir, f"page_{page_no}.png"))

    print(f"\nFinished: {page_count} pages saved to {out_dir}")

# Convert each company's PDF to page images, in the same order as before.
for company in ("nvidia", "meta", "google"):
    transform_to_image(company)


Processing nvidia: 100%|██████████| 40/40 [00:03<00:00, 13.28page/s]



Finished: 40 pages saved to ../data/imgs_nvidia


Processing meta: 100%|██████████| 74/74 [00:09<00:00,  7.88page/s]



Finished: 74 pages saved to ../data/imgs_meta


Processing google: 100%|██████████| 120/120 [00:09<00:00, 12.09page/s]


Finished: 120 pages saved to ../data/imgs_google





# OCR with the Nanonets-OCR2-3B model

## Model loading

In [4]:
# Checkpoint identifier shared by the model, tokenizer and processor below.
model_path = "nanonets/Nanonets-OCR2-3B"

# Load the weights with "auto" dtype / device placement and the
# flash_attention_2 attention implementation, then switch to eval mode.
# `.eval()` returns the module itself, so the call can be chained.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    attn_implementation="flash_attention_2",
    device_map="auto",
    dtype="auto",
).eval()

# Text tokenizer and multimodal processor from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## OCR function

In [None]:
def ocr_page_with_nanonets_s(
    image: Image.Image,
    model,
    processor,
    max_new_tokens=4096,
    user_prompt=None
):
    """
    Run OCR on a single page image with a chat-style image-text-to-text model.

    A system + user conversation (the image followed by the text prompt) is
    rendered with the processor's chat template, generation runs greedily
    (``do_sample=False``) with gradients disabled, and only the tokens
    produced after the prompt are decoded.

    Args:
        image: Page image to transcribe.
        model: Model object exposing ``generate`` and ``device``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        user_prompt: Optional prompt that replaces the built-in extraction
            instructions. The built-in prompt is used only when this is
            ``None``.

    Returns:
        The decoded text of the first sequence in the generated batch.

    Notes:
        Cleanup runs in a ``finally`` block: references to the input and
        output tensors are dropped before ``gc.collect()`` and
        ``torch.cuda.empty_cache()`` are called, and this happens even when
        generation raises, so a failed page does not leave its tensors
        referenced by this function.
    """

    # NOTE: the continuation lines of this literal keep their source
    # indentation, so those leading spaces are part of the prompt text sent
    # to the model. Left unchanged on purpose to keep model input stable.
    prompt = (
        """Extract the text from the above document as if you were reading it naturally.
        Return the tables as html. Return the equations in LaTeX representation.
        If there is an image in the document and image caption is not present, add a small description inside <img></img>.
        Page numbers as <page_number></page_number>.
        For footers use <footer></footer>.
        Use ☐ and ☑ for checkboxes."""
        if user_prompt is None else user_prompt
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [ {"type": "image", "image": image},
                                      {"type": "text", "text": prompt},],
        },
    ]

    # Pre-bind the tensor names so the `finally` block can always `del` them,
    # even if an early step fails before they are assigned.
    inputs = output_ids = generated = None
    try:
        # Build chat prompt
        chat = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Prepare inputs and move them to the model's device
        inputs = processor(text=chat, images=image, return_tensors="pt").to(model.device)

        # Run generation (no gradients)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )

        # generate() returns prompt + continuation; keep only the tokens
        # after the prompt length.
        generated = output_ids[:, inputs.input_ids.shape[1]:]

        # Decode
        text_output = processor.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )[0]
    finally:
        # Drop our references first; otherwise the tensors are still alive
        # when the cache is emptied and their memory cannot be returned.
        del inputs, output_ids, generated
        gc.collect()
        torch.cuda.empty_cache()

    return text_output

## Text saving function

In [None]:
def append_to_txt(company, text):
    """
    Append OCR text to ``../data/{company}_ocr.txt`` using UTF-8.

    The file is opened in append mode, so it is created on first use and
    repeated calls accumulate content. When ``text`` is non-empty and does
    not already end with a newline, one is added so that the last line of
    one append and the first line of the next do not run together.

    Args:
        company: Name used to build the output file name.
        text: Text to append.
    """
    output_file = f"../data/{company}_ocr.txt"
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(text)
        # Keep successive appends on separate lines.
        if text and not text.endswith("\n"):
            f.write("\n")


In [16]:
# NOTE(review): in the visible notebook, `result` is only assigned by the OCR
# cell that follows this one, so on a fresh kernel that cell must run first.
append_to_txt("nvidia", result)

In [12]:
# Try the OCR function on a single rendered page and show the raw text.
image = Image.open("../data/imgs_nvidia/page_15.png")
result = ocr_page_with_nanonets_s(
    image=image,
    model=model,
    processor=processor,
    max_new_tokens=15000,
)
print(result)

<header>People, Diversity, and Inclusion</header>

<img>A horizontal bar chart titled “FY25 Hiring Data*” showing the distribution of hires by age across global regions. The Americas segment (40.5 %) is the longest green bar, followed by EMEA (31.3 %), APAC (26.9 %), and India (18.7 %). A legend indicates that the first three bars represent the Americas, EMEA, and APAC regions respectively, while the fourth bar represents India.</img>

*Numbers do not equal 100% due to rounding. We have increased focus on diversity recruiting, resulting in an increase in global female hiring in each channel. For additional diversity metrics, please see our Sustainability Indicators.

<footer>NVIDIA Sustainability | People, Diversity, and Inclusion</footer>

<page_number>15</page_number>


# FY25 Hiring Data*

## By Age (Global)

- **20-30 Years:** 40.5%
- **31-50 Years:** 52.9%
- **51+ Years:** 6.5%
- **No Data:** 0.1%

## By Gender (Global)

- **Men:** 70.2%
- **Women:** 26.9%
- **No Data:** 2.3%
- **