In [2]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import os
from torchvision import transforms

# Load pretrained model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> i

In [10]:
# Define transformation for resizing and normalizing images
transform = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize images to fit model input
    transforms.ToTensor(),  
])

# Path to your dataset folder
image_folder = "file1_ipmages"

# Collect all image paths
image_paths = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith((".jpg", ".png"))]

# Load and process images
images = [transform(Image.open(img).convert("RGB")) for img in image_paths]

# Convert list of tensors into a batch
pixel_values = torch.stack(images)

# Ensure correct batch shape
print(pixel_values.shape)  # Should be (batch_size, 3, 384, 384)


torch.Size([6, 3, 384, 384])


In [18]:
# Path to the text folder
text_folder = "text"

# Load ground truth text for the first 3 images
labels = []
for img_path in image_paths[:3]:  # Only process the first 3 images
    img_name = os.path.basename(img_path).split(".")[0]  # Extract image name
    text_file = os.path.join(text_folder, f"{img_name}.txt")  # Corresponding text file

    if os.path.exists(text_file):
        with open(text_file, "r", encoding="utf-8") as f:
            labels.append(" ".join([line.strip() for line in f.readlines()]))  # Merge lines
    else:
        labels.append("")  # If missing, keep empty

# Tokenize text labels
label_inputs = processor.tokenizer(labels, padding=True, return_tensors="pt").input_ids
print(label_inputs.shape)

torch.Size([3, 2])


In [22]:
# Move model and data to device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
pixel_values = pixel_values[:3].to(device)  # Only first 3 images

# Generate predictions
with torch.no_grad():
    generated_ids = model.generate(pixel_values)

# Decode the predicted token IDs into text
predicted_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Print results
for i, pred in enumerate(predicted_texts):
    print(f"Image {i+1} Prediction: {pred}")


Image 1 Prediction: 25 July
Image 2 Prediction: 0 0
Image 3 Prediction: 0 0
