In [2]:
!pip install git+https://github.com/huggingface/transformers
!pip install qwen-vl-utils

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-vnh42rnh
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-vnh42rnh
  Resolved https://github.com/huggingface/transformers to commit 40cba20e8781e2ac5936fca081a88493c3ce8b43
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.52.0.dev0-py3-none-any.whl size=11396420 sha256=0db1bfed36d3265ad4c1689878fef664637db5cfe284b99e4e4c81e2dac1634c
  Stored in directory: /tmp/pip-ephem-wheel-cache-6hh9n04_/wheels/04/a3/f1/b88775f8e1665827525b19ac7590250f1038d947067beba9fb
Successfully built transformer

In [3]:
import huggingface_hub
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

In [4]:
# Load the 3B instruct checkpoint directly onto the GPU; `torch_dtype="auto"`
# leaves the choice of weight dtype to the library.
# Alternative that was tried: "Qwen/Qwen2.5-VL-32B-Instruct" with device_map="cpu".
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype="auto",
    device_map="cuda",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [5]:
# Processor for the same checkpoint as `model`; it is used below both to render
# the chat template and to turn text + images into model input tensors.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
)

In [9]:
import os
import csv
from PIL import Image

# --- Configuration ----------------------------------------------------------
image_folder = "/content/cropped_objects/content/cropped_objects"
output_csv = "ocr_results.csv"
supported_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff")
resized_folder = "/tmp/resized_images"

MAX_IMAGE_SIDE = 720      # longest allowed edge (pixels) of the image sent to the model
MAX_NEW_TOKENS = 1024     # generation budget per image
OCR_PROMPT = "Please extract all the text from this image."


def resize_for_ocr(src_path, dst_path, max_side=MAX_IMAGE_SIDE):
    """Write an RGB copy of ``src_path`` to ``dst_path``, bounded by ``max_side``.

    The image is shrunk in place with ``Image.thumbnail`` (aspect ratio is
    preserved) using LANCZOS resampling, then saved under ``dst_path``; the
    output format is whatever PIL infers from the destination file extension.

    Args:
        src_path: Path of the image to read.
        dst_path: Path the resized copy is written to.
        max_side: Upper bound, in pixels, for both width and height.

    Returns:
        ``dst_path``, for convenient chaining.
    """
    with Image.open(src_path) as img:
        rgb = img.convert("RGB")
        rgb.thumbnail((max_side, max_side), Image.Resampling.LANCZOS)
        rgb.save(dst_path)
    return dst_path


def extract_text(image_path, prompt=OCR_PROMPT, max_new_tokens=MAX_NEW_TOKENS):
    """Run the notebook's ``model``/``processor`` on one image and return its reply.

    Builds a single-turn chat message containing the image and ``prompt``,
    generates up to ``max_new_tokens`` tokens, and decodes only the newly
    generated tokens (the prompt tokens are sliced off first).

    Args:
        image_path: Path of the image file to send to the model.
        prompt: Instruction text placed after the image in the user turn.
        max_new_tokens: Passed through to ``model.generate``.

    Returns:
        The decoded model output with surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was loaded on instead of hard-coding "cuda"
    # and re-moving the model on every call.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    new_token_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0].strip()


# --- Run OCR over the folder ------------------------------------------------
os.makedirs(resized_folder, exist_ok=True)

image_names = sorted(
    name for name in os.listdir(image_folder)
    if name.lower().endswith(supported_exts)
)

ocr_results = []

# Rows are written (and flushed) as soon as each image finishes, so a failure
# part-way through a long run still leaves the completed results on disk.
with open(output_csv, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["image_filename", "extracted_text"])  # header

    for image_name in image_names:
        original_path = os.path.join(image_folder, image_name)
        resized_path = os.path.join(resized_folder, image_name)

        resize_for_ocr(original_path, resized_path)
        row = (image_name, extract_text(resized_path))

        ocr_results.append(row)
        writer.writerow(row)
        f.flush()


