In [None]:
import os
import torch
# Ask PyTorch's CUDA caching allocator to use expandable segments, then
# release any cached GPU memory that is currently unoccupied.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline
import torch
# Build a Hugging Face "image-text-to-text" pipeline around the Gemma 3 4B
# instruction-tuned checkpoint, on the GPU, with bfloat16 weights.
# NOTE(review): `pipe` is not referenced by the other cells visible here, and
# the same checkpoint is loaded again in the next cell — confirm this extra
# copy is really needed, since it also occupies GPU memory.
pipe = pipeline(
    task="image-text-to-text",
    torch_dtype=torch.bfloat16,
    device="cuda",
    model="google/gemma-3-4b-it",
)


In [None]:
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests
import torch

# Hugging Face Hub id of the Gemma 3 4B instruction-tuned checkpoint.
model_id = "google/gemma-3-4b-it"

# Load the weights explicitly in bfloat16. The inputs built for generation are
# cast to bfloat16 and the pipeline cell uses bfloat16 as well; without an
# explicit dtype, transformers releases that default to float32 would keep the
# weights at twice the size, which matters on a memory-constrained GPU.
# device_map="auto" lets the library decide where to place the weights, and
# .eval() switches the module to inference mode (no dropout etc.).
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).eval()

# Processor matching the checkpoint; used to build chat-template inputs and to
# decode generated token ids back to text.
processor = AutoProcessor.from_pretrained(model_id)


In [None]:
import json
# Directory that holds the images to run OCR on.
directory_path = "/home/jovyan/images"

# Keep regular files only (sub-directories are skipped) and sort the names:
# os.listdir() returns entries in arbitrary order, so sorting makes the
# processing order — and the key order of the results — reproducible.
files = sorted(
    name
    for name in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, name))
)

# Maps each file name to the list of text tokens extracted from that image.
data = {}

# Upper bound on the tokens generated per image; longer output is cut off here.
MAX_NEW_TOKENS = 100

# The instruction text is the same for every image, so it is built once.
# It is sent to the model verbatim (including its leading indentation).
prompt = """
    TASK : Object Character Recognition by Region
    Detect all text in the image.
    Seperate the texts by its individual regions.
    Do not include any header.
    Include only the text.
    """

# Run OCR on every file and store the result under its file name.
for index, file in enumerate(files):
    # Release cached, currently unoccupied GPU memory before the next image.
    torch.cuda.empty_cache()
    print(f"Processing image {file}")

    image_path = os.path.join(directory_path, file)

    # Chat-style request: a system instruction plus a user turn carrying the
    # image (referenced by path) and the OCR prompt. Built fresh per image.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant that focuses on OCR tasks"}]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt}
            ]
        }
    ]
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt"
    ).to(model.device, dtype=torch.bfloat16)

    # Number of prompt tokens; used to strip the prompt from the output below.
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        # Greedy decoding (do_sample=False) for repeatable output.
        generation = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)
        # Keep only the newly generated tokens of the first (only) sequence.
        generation = generation[0][input_len:]

    decoded = processor.decode(generation, skip_special_tokens=True)
    # Split the decoded text on whitespace into a list of tokens.
    # Note: no case normalisation is applied; the text is stored as generated.
    labels_list = decoded.split()

    data[file] = labels_list

    print(f"Successfully processed Image {file} ,{index+1}/{len(files)} Images Processed")
    

    
# Persist the collected results as 4-space-indented JSON in the working
# directory, then report how many files were handled.
with open('GemmaOCR.json', 'w') as json_file:
    json_file.write(json.dumps(data, indent=4))

print(f"All {len(files)} have been processed.")