In [1]:
!pip install --upgrade datasets fsspec huggingface_hub

Collecting fsspec
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [2]:
from datasets import load_dataset

ds = load_dataset("Teklia/IAM-line")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from datasets import DatasetDict

# Check dataset keys
print(ds)

# Print the first example from the train split
print(ds['train'][0])


DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 6482
    })
    validation: Dataset({
        features: ['image', 'text'],
        num_rows: 976
    })
    test: Dataset({
        features: ['image', 'text'],
        num_rows: 2915
    })
})
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=2467x128 at 0x7C66F913E150>, 'text': 'put down a resolution on the subject'}


In [4]:
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
def preprocess(example):
    # Convert image to RGB
    image = example["image"].convert("RGB")

    # Processor automatically resizes & normalizes
    pixel_values = processor(images=image, return_tensors="pt").pixel_values[0]

    # Tokenize labels
    labels = processor.tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    ).input_ids

    # Replace padding token IDs with -100 (ignored in loss)
    labels = [label if label != processor.tokenizer.pad_token_id else -100 for label in labels]

    return {"pixel_values": pixel_values, "labels": labels}


In [6]:
ds = ds.map(preprocess, remove_columns=ds["train"].column_names)


In [7]:
from torch.utils.data import Dataset

class IAMDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "pixel_values": item["pixel_values"],
            "labels": item["labels"]
        }


In [8]:
from torch.utils.data import DataLoader

train_dataset = IAMDataset(ds["train"])
val_dataset = IAMDataset(ds["validation"])

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)


In [9]:
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.decoder.config.vocab_size


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [10]:
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()


  scaler = GradScaler()


In [11]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [12]:
def collate_fn(batch):
    pixel_values = torch.stack([x["pixel_values"] for x in batch])
    labels = [torch.tensor(x["labels"]) for x in batch]

    # Pad labels manually (important for text)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

    return {"pixel_values": pixel_values, "labels": labels}


In [13]:
# import torch
# from tqdm import tqdm
# from torch.utils.data import DataLoader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Define the collate_fn before creating DataLoaders
# def collate_fn(batch):
#     pixel_values = torch.stack([x["pixel_values"] for x in batch])
#     labels = [torch.tensor(x["labels"]) for x in batch]

#     # Pad labels manually (important for text)
#     labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

#     return {"pixel_values": pixel_values, "labels": labels}

# # Update DataLoader creation to include collate_fn
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
# val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn)


# for epoch in range(10):  # You can later add early stopping if needed
#     model.train()
#     loop = tqdm(train_dataloader, leave=True)
#     total_loss = 0

#     for batch in loop:
#         # After applying collate_fn, batch["pixel_values"] will be a tensor
#         pixel_values = batch["pixel_values"].to(device)
#         # After applying collate_fn, batch["labels"] will be a tensor
#         labels = batch["labels"].to(device) # labels is already a tensor from collate_fn

#         optimizer.zero_grad()
#         with autocast():
#             outputs = model(pixel_values=pixel_values, labels=labels)
#             loss = outputs.loss

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()

#         loop.set_description(f"Epoch {epoch}")
#         loop.set_postfix(loss=loss.item())
#         total_loss += loss.item()

#     avg_loss = total_loss / len(train_dataloader)
#     print(f"Epoch {epoch} Avg Loss: {avg_loss:.4f}")

In [14]:
# import torch
# from tqdm import tqdm
# from torch.utils.data import DataLoader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Define the collate_fn before creating DataLoaders
# def collate_fn(batch):
#     # Ensure each item's pixel_values is treated as a tensor before stacking
#     pixel_values = torch.stack([torch.as_tensor(x["pixel_values"]) for x in batch])
#     labels = [torch.tensor(x["labels"]) for x in batch]

#     # Pad labels manually (important for text)
#     # Ensure padding_value is a tensor
#     labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)


#     return {"pixel_values": pixel_values, "labels": labels}

# # Update DataLoader creation to include collate_fn
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
# val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn)


# for epoch in range(1):  # You can later add early stopping if needed
#     model.train()
#     loop = tqdm(train_dataloader, leave=True)
#     total_loss = 0

#     for batch in loop:
#         # After applying collate_fn, batch["pixel_values"] will be a tensor
#         pixel_values = batch["pixel_values"].to(device)
#         # After applying collate_fn, batch["labels"] will be a tensor
#         labels = batch["labels"].to(device) # labels is already a tensor from collate_fn

#         optimizer.zero_grad()
#         with autocast():
#             outputs = model(pixel_values=pixel_values, labels=labels)
#             loss = outputs.loss

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()

#         loop.set_description(f"Epoch {epoch}")
#         loop.set_postfix(loss=loss.item())
#         total_loss += loss.item()

#     avg_loss = total_loss / len(train_dataloader)
#     print(f"Epoch {epoch} Avg Loss: {avg_loss:.4f}")

In [15]:
# import torch
# from tqdm import tqdm
# from torch.utils.data import DataLoader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Define the collate_fn before creating DataLoaders
# def collate_fn(batch):
#     # Ensure each item's pixel_values is treated as a tensor before stacking
#     pixel_values = torch.stack([torch.as_tensor(x["pixel_values"]) for x in batch])
#     labels = [torch.tensor(x["labels"]) for x in batch]

#     # Pad labels manually (important for text)
#     # Ensure padding_value is a tensor
#     labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

#     return {"pixel_values": pixel_values, "labels": labels}

# # Update DataLoader creation to include collate_fn, num_workers, and pin_memory
# # Start with a small number of workers (e.g., 2 or 4) and increase it if it helps
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn, num_workers=4, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn, num_workers=4, pin_memory=True)


# for epoch in range(1):  # You can later add early stopping if needed
#     model.train()
#     loop = tqdm(train_dataloader, leave=True)
#     total_loss = 0

#     for batch in loop:
#         # After applying collate_fn, batch["pixel_values"] will be a tensor
#         pixel_values = batch["pixel_values"].to(device)
#         # After applying collate_fn, batch["labels"] will be a tensor
#         labels = batch["labels"].to(device) # labels is already a tensor from collate_fn

#         optimizer.zero_grad()
#         with autocast():
#             outputs = model(pixel_values=pixel_values, labels=labels)
#             loss = outputs.loss

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()

#         loop.set_description(f"Epoch {epoch}")
#         loop.set_postfix(loss=loss.item())
#         total_loss += loss.item()

#     avg_loss = total_loss / len(train_dataloader)
#     print(f"Epoch {epoch} Avg Loss: {avg_loss:.4f}")

In [17]:
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, Subset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define collate function
def collate_fn(batch):
    pixel_values = torch.stack([torch.as_tensor(x["pixel_values"]) for x in batch])
    labels = [torch.tensor(x["labels"]) for x in batch]

    labels = torch.nn.utils.rnn.pad_sequence(
        labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id
    )

    return {"pixel_values": pixel_values, "labels": labels}

# Subsets for debugging/speed
train_dataset_subset = Subset(train_dataset, range(10))
val_dataset_subset = Subset(val_dataset, range(3))  # Limit validation to 3 samples

# DataLoaders
train_dataloader = DataLoader(
    train_dataset_subset, batch_size=4, shuffle=True,
    collate_fn=collate_fn, num_workers=0, pin_memory=True
)
val_dataloader = DataLoader(
    val_dataset_subset, batch_size=1, shuffle=False,
    collate_fn=collate_fn, num_workers=0, pin_memory=True
)

# Training loop (1 epoch for quick test)
for epoch in range(1):
    model.train()
    loop = tqdm(train_dataloader, leave=True)
    total_loss = 0

    for batch in loop:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with autocast():
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch} Avg Loss: {avg_loss:.4f}")

# Evaluation loop
model.eval()
predictions = []
references = []

for batch in tqdm(val_dataloader):
    pixel_values = batch["pixel_values"].to(device)
    labels = batch["labels"]

    with torch.no_grad():
        generated_ids = model.generate(pixel_values.to(device))

    # Decode predictions
    pred_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Clean up labels: remove -100 and ensure they are valid token IDs
    cleaned_labels = []
    for label in labels:
        label = label[label != -100]  # Remove ignored indices
        label = label[label < processor.tokenizer.vocab_size]  # Avoid large integers
        cleaned_labels.append(label)

    label_texts = processor.batch_decode(cleaned_labels, skip_special_tokens=True)

    predictions.extend(pred_texts)
    references.extend(label_texts)

# Print results
for i, (pred, ref) in enumerate(zip(predictions, references)):
    print(f"\nSample {i+1}:")
    print(f"Prediction: {pred}")
    print(f"Reference : {ref}")


  with autocast():
Epoch 0: 100%|██████████| 3/3 [02:17<00:00, 45.84s/it, loss=5.41]


Epoch 0 Avg Loss: 6.5096


100%|██████████| 3/3 [00:57<00:00, 19.16s/it]


Sample 1:
Prediction: A K is a Mr Gakell M for Mr Gakell M for Mr Gakell M for Mr
Reference : It was a splendid interpretation of the

Sample 2:
Prediction: sympathetic to Mr Gakekekekekekekekekekekekekeke
Reference : sympathetic C O . Paul Daneman gave another

Sample 3:
Prediction: past of Mr Ga to Mr Gakell P to Mr Gakell Mr Gakell Mr Gakell from
Reference : part . The rest of the cast were well chosen ,





In [23]:
!pip install evaluate

import evaluate

# Load the CER metric
cer_metric = evaluate.load("cer")

# Compute CER
cer = cer_metric.compute(predictions=predictions, references=references)
print(f"Character Error Rate (CER): {cer:.4f}")


Character Error Rate (CER): 0.9766


In [24]:
!pip install evaluate

import evaluate

# Load the WER metric
wer_metric = evaluate.load("wer")

# Compute WER
wer = wer_metric.compute(predictions=predictions, references=references)
print(f"Word Error Rate (WER): {wer:.4f}")




Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Word Error Rate (WER): 1.5000
