In [1]:
import json
import os
from PIL import Image
import torch
from collections import Counter
from torch.utils.data import Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from transformers import Trainer
from sklearn.model_selection import train_test_split
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, TrainingArguments, Trainer
from collections import Counter
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class TrOCRDataset(Dataset):
    def __init__(self, json_file, processor):
        # Load annotations
        with open(json_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        image_path = entry['filepath']
        text = entry['text']

        # Load image
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        image = Image.open(image_path).convert("RGB")

        # Process image (no padding/max_length for image processor)
        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values

        # Process text
        labels = self.processor.tokenizer(
            text, 
            return_tensors="pt", 
            padding="max_length", 
            max_length=128, 
            truncation=True
        ).input_ids

        # Remove batch dimension
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": labels.squeeze(0)}

        return encoding

In [5]:
# Load processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")

# Initialize dataset
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"
dataset = TrOCRDataset(json_file, processor)

# Check dataset size
print(f"Dataset size: {len(dataset)}")

# Test one sample
sample = dataset[0]
print(f"Sample keys: {sample.keys()}")
print(f"Image tensor shape: {sample['pixel_values'].shape}")
print(f"Label IDs shape: {sample['labels'].shape}")

Dataset size: 694
Sample keys: dict_keys(['pixel_values', 'labels'])
Image tensor shape: torch.Size([3, 384, 384])
Label IDs shape: torch.Size([128])


In [7]:
# Load annotations
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Split into train and validation
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Verify sizes
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

Training set size: 555
Validation set size: 139


In [9]:
from collections import Counter
train_filepaths = [entry['filepath'] for entry in train_data]
val_filepaths = [entry['filepath'] for entry in val_data]
print(f"Train duplicates: {sum(1 for fp, count in Counter(train_filepaths).items() if count > 1)}")
print(f"Val duplicates: {sum(1 for fp, count in Counter(val_filepaths).items() if count > 1)}")

Train duplicates: 0
Val duplicates: 0


In [11]:
# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Set model config
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = processor.tokenizer.vocab_size

# Create datasets
train_dataset = TrOCRDataset(json_file, processor)  # Temporarily use full dataset
val_dataset = TrOCRDataset(json_file, processor)    # Will fix in Step 5

# Override with split data
train_dataset.data = train_data
val_dataset.data = val_data

# Verify
print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

Train dataset size: 555
Val dataset size: 139


In [15]:
from collections import Counter
import evaluate
def compute_metrics(eval_pred):
    cer_metric = evaluate.load("cer")
    predictions, labels = eval_pred

    pred_ids = predictions.argmax(-1)
    label_ids = labels
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_texts, references=label_texts)
    return {"cer": cer}

In [None]:
from transformers import TrainingArguments
# Training arguments
training_args = TrainingArguments(
    output_dir="./trocr_finetuned_temp",  # Temporary directory, won’t save model
    num_train_epochs=3,                   # Start with 3 epochs to test
    per_device_train_batch_size=2,        # Small batch size for trocr-large
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",          # Evaluate after each epoch
    logging_dir="./logs",
    logging_steps=100,                    # Log loss every 100 steps
    load_best_model_at_end=False,         # Don’t save checkpoints yet
    dataloader_num_workers=0,             # Set to 2-4 if on Linux with GPU
    report_to="all",                      # Ensure metrics are logged to console
)

# Custom data collator
def data_collator(features):
    return {
        'pixel_values': torch.stack([f['pixel_values'] for f in features]),
        'labels': torch.stack([f['labels'] for f in features]),
    }

# Initialize Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Add custom metrics
)

# Train and log metrics
trainer.train()

In [3]:
import json
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate
from collections import Counter

In [29]:
# Custom Dataset
class TrOCRDataset(Dataset):
    def __init__(self, data, processor, image_dir=""):
        self.data = data
        self.processor = processor
        self.image_dir = image_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        image_path = os.path.join(self.image_dir, entry['filepath']) if self.image_dir else entry['filepath']
        text = entry['text']

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        image = Image.open(image_path).convert("RGB")

        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(
            text, 
            padding="max_length", 
            max_length=128, 
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze(0)

        return {"pixel_values": pixel_values, "labels": labels}




In [31]:
# Load processor and model (use trocr-base for CPU)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Set model config
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = processor.tokenizer.vocab_size

# Load and split data
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

# Check for duplicates
train_filepaths = [entry['filepath'] for entry in train_data]
val_filepaths = [entry['filepath'] for entry in val_data]
print(f"Train duplicates: {sum(1 for _, count in Counter(train_filepaths).items() if count > 1)}")
print(f"Val duplicates: {sum(1 for _, count in Counter(val_filepaths).items() if count > 1)}")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Training set size: 80
Validation set size: 20
Train duplicates: 0
Val duplicates: 0


In [21]:
# Create datasets
train_dataset = TrOCRDataset(train_data, processor)
val_dataset = TrOCRDataset(val_data, processor)

# CER Metric
cer_metric = evaluate.load("cer")

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Extract logits from tuple (predictions[0] is logits)
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    pred_ids = torch.tensor(logits).argmax(dim=-1)  # Convert to tensor if needed
    label_ids = labels
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_texts, references=label_texts)
    return {"cer": cer}

# Data collator
def data_collator(features):
    pixel_values = torch.stack([f['pixel_values'] for f in features])
    labels = torch.stack([f['labels'] for f in features])
    return {'pixel_values': pixel_values, 'labels': labels}

In [25]:
# Training arguments optimized for CPU
training_args = TrainingArguments(
    output_dir="./trocr_finetuned",
    num_train_epochs=3,  # Fewer epochs to test on CPU
    per_device_train_batch_size=1,  # Minimal batch size
    per_device_eval_batch_size=1,
    eval_strategy="epoch",  # Updated from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,  # Lower CER is better
    logging_dir="./logs",
    logging_steps=20,  # Frequent logging for CPU
    dataloader_num_workers=0,  # No multiprocessing on Windows
    gradient_accumulation_steps=2,  # Effective batch size = 1 * 2 = 2
    save_total_limit=1,  # Save only best model
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

Epoch,Training Loss,Validation Loss


AttributeError: 'tuple' object has no attribute 'argmax'

In [None]:

# Test on a sample
sample = val_dataset[0]
image = Image.open(val_data[0]['filepath']).convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values.to(model.device)
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f"Predicted text: {generated_text}")
print(f"True text: {val_data[0]['text']}")

In [3]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Load the image
image_path = r"C:\Users\hssin\Downloads\dataset_split1\train\Ordonnance\0711--8483891--20230705_page_2.jpg"
image = Image.open(image_path).convert("RGB")

# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")  # or "trocr-large-handwritten"
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocess image and generate text
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
generated_ids = model.generate(pixel_values)

# Decode output
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Recognized text:", generated_text)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

Recognized text: 1953 .


In [37]:
import json
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate
from collections import Counter

# Custom Dataset
class TrOCRDataset(Dataset):
    def __init__(self, data, processor, image_dir=""):
        self.data = data
        self.processor = processor
        self.image_dir = image_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        image_path = os.path.join(self.image_dir, entry['filepath']) if self.image_dir else entry['filepath']
        text = entry['text']

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        image = Image.open(image_path).convert("RGB")

        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(
            text, 
            padding="max_length", 
            max_length=128, 
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze(0)

        return {"pixel_values": pixel_values, "labels": labels}

# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Set model config
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = processor.tokenizer.vocab_size

# Load and split data
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Subsample for quick testing (use 100 images total)
data = data[:100]  # Comment this out to use full dataset later
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

# Check for duplicates
train_filepaths = [entry['filepath'] for entry in train_data]
val_filepaths = [entry['filepath'] for entry in val_data]
print(f"Train duplicates: {sum(1 for _, count in Counter(train_filepaths).items() if count > 1)}")
print(f"Val duplicates: {sum(1 for _, count in Counter(val_filepaths).items() if count > 1)}")

# Create datasets
train_dataset = TrOCRDataset(train_data, processor)
val_dataset = TrOCRDataset(val_data, processor)

# CER Metric
cer_metric = evaluate.load("cer")

# Fixed compute_metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Extract logits from tuple (predictions[0] is logits)
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    pred_ids = torch.tensor(logits).argmax(dim=-1)  # Convert to tensor if needed
    label_ids = labels
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_texts, references=label_texts)
    return {"cer": cer}

# Data collator
def data_collator(features):
    pixel_values = torch.stack([f['pixel_values'] for f in features])
    labels = torch.stack([f['labels'] for f in features])
    return {'pixel_values': pixel_values, 'labels': labels}

# Training arguments optimized for CPU
training_args = TrainingArguments(
    output_dir="./trocr_finetuned",
    num_train_epochs=1,  # Single epoch for quick testing
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    logging_dir="./logs",
    logging_steps=10,  # More frequent for small dataset
    dataloader_num_workers=0,
    gradient_accumulation_steps=2,
    save_total_limit=1,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Training set size: 80
Validation set size: 20
Train duplicates: 0
Val duplicates: 0


Epoch,Training Loss,Validation Loss,Cer
1,0.6745,0.32926,0.845238


There were missing keys in the checkpoint model loaded: ['decoder.output_projection.weight'].


TrainOutput(global_step=40, training_loss=1.029883199930191, metrics={'train_runtime': 1403.5512, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.028, 'total_flos': 5.986281527967744e+16, 'train_loss': 1.029883199930191, 'epoch': 1.0})

In [69]:

# Save the model
#trainer.save_model("./trocr_finetuned_final")
#processor.save_pretrained("./trocr_finetuned_final")

# Test on a sample
sample = val_dataset[0]
image = Image.open(val_data[1]['filepath']).convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f"Predicted text: {generated_text}")
print(f"True text: {val_data[0]['text']}")

Predicted text: 26
True text: Ayar Abdlekrim


In [71]:
import json
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Your JSON
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"

# Load JSON
with open(json_file, 'r') as f:
    data = json.load(f)

# First image
entry = data[0]
image_path = entry['filepath']
true_text = entry['text']
print(f"Image: {image_path}")
print(f"True: {true_text}")

# Load model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Load image
image = Image.open(image_path).convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values

# Predict
generated_ids = model.generate(pixel_values)
predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f"Predicted: {predicted_text}")

Image: C:\Users\hssin\Downloads\mergcropp\R+pamramique.png
True: R+pamramique


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

Predicted: Repampearance .


In [13]:
import json
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate

# Your JSON
json_file = r"C:\Users\hssin\Downloads\jsoncropped\annotation2.json"

# Dataset class
class TrOCRDataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        image = Image.open(entry['filepath']).convert("RGB")
        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(
            entry['text'], padding="max_length", max_length=128, truncation=True, return_tensors="pt"
        ).input_ids.squeeze(0)
        return {"pixel_values": pixel_values, "labels": labels}

# Load model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Load JSON
with open(json_file, 'r') as f:
    data = json.load(f)

# Use all images (~694)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
print(f"Train: {len(train_data)}, Val: {len(val_data)}")

# Create datasets
train_dataset = TrOCRDataset(train_data, processor)
val_dataset = TrOCRDataset(val_data, processor)

# CER metric
cer_metric = evaluate.load("cer")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    pred_ids = torch.tensor(logits).argmax(dim=-1)
    label_ids = labels
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = processor.batch_decode(label_ids, skip_special_tokens=True)
    return {"cer": cer_metric.compute(predictions=pred_texts, references=label_texts)}

def data_collator(features):
    pixel_values = torch.stack([f['pixel_values'] for f in features])
    labels = torch.stack([f['labels'] for f in features])
    return {'pixel_values': pixel_values, 'labels': labels}

# Training settings
training_args = TrainingArguments(
    output_dir=r"./trocr_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    logging_dir=r"./logs",
    logging_steps=10,
    dataloader_num_workers=0,
    gradient_accumulation_steps=2,
    save_total_limit=1,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

Train: 555, Val: 139


Epoch,Training Loss,Validation Loss,Cer
0,0.2632,0.292362,0.909708


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })

In [None]:

# Save
trainer.save_model(r"C:\Users\hssin\trocr_finetuned_final")

# Show results
print("\nPredictions:")
for i in range(min(3, len(val_dataset))):
    image = Image.open(val_data[i]['filepath']).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Sample {i+1}: Predicted: {predicted_text}, True: {val_data[i]['text']}")