In [1]:
# Install required packages
# !pip install torch transformers datasets accelerate pillow pandas

import torch
from torch.utils.data import DataLoader
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset, DatasetDict
import pandas as pd
from PIL import Image
import os




  from pandas.core import (


In [10]:
# Set environment variable to avoid TensorFlow conflicts
os.environ["TRANSFORMERS_NO_TF"] = "1"

# 1. Load and prepare dataset
def load_dataset(csv_path="labels.csv", image_dir="dataset/images"):
    df = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(df)
    return dataset

dataset = load_dataset()

In [11]:
# 2. Initialize processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")


# Configure model
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.decoder.config.vocab_size
model.config.max_length = 64

# Fix generation config
model.generation_config.early_stopping = False  # Disable early stopping
model.generation_config.no_repeat_ngram_size = 3

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [13]:
# 3. preprocessing function
def preprocess(example):
    try:
        image_path = f"dataset/images/{example['filename']}"
        image = Image.open(image_path).convert("RGB")
        
        # Process image and ensure output is tensor
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        # Remove batch dimension and convert to tensor
        pixel_values = pixel_values.squeeze(0)
        
        # Process text
        labels = processor.tokenizer(
            example["text"],
            return_tensors="pt",
            padding="max_length",
            max_length=model.config.max_length
        ).input_ids.squeeze(0)  # Remove batch dimension
        
        return {
            "pixel_values": pixel_values,
            "labels": labels
        }
    except Exception as e:
        print(f"Error processing {example['filename']}: {str(e)}")
        return None

# Apply preprocessing and filter out None values
dataset = dataset.map(preprocess).filter(lambda x: x is not None)

Map:   0%|          | 0/439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/439 [00:00<?, ? examples/s]

In [14]:
# 4. Split dataset
dataset = dataset.train_test_split(test_size=0.1)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['filename', 'text', 'pixel_values', 'labels'],
        num_rows: 395
    })
    test: Dataset({
        features: ['filename', 'text', 'pixel_values', 'labels'],
        num_rows: 44
    })
})


In [15]:
# 5. Corrected collate function
def collate_fn(batch):
    # Ensure all elements are tensors before stacking
    pixel_values = torch.stack([item["pixel_values"] if isinstance(item["pixel_values"], torch.Tensor) 
                              else torch.tensor(item["pixel_values"]) for item in batch])
    
    labels = torch.stack([item["labels"] if isinstance(item["labels"], torch.Tensor)
                         else torch.tensor(item["labels"]) for item in batch])
    
    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [16]:
# 6. Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=200,
    num_train_epochs=5,
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)

In [17]:
# 7. Create Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=collate_fn,
    tokenizer=processor,
)

  trainer = Seq2SeqTrainer(


In [18]:
# 8. Start training
trainer.train()

Step,Training Loss,Validation Loss
200,0.4089,0.380458
400,0.1597,0.291571




TrainOutput(global_step=495, training_loss=0.36558131400984945, metrics={'train_runtime': 10489.506, 'train_samples_per_second': 0.188, 'train_steps_per_second': 0.047, 'total_flos': 1.4778632522170368e+18, 'train_loss': 0.36558131400984945, 'epoch': 5.0})

In [19]:
# 9. Save the model
trainer.save_model("./trocr-finetuned-final")
processor.save_pretrained("./trocr-finetuned-final")

[]

# Test model

In [20]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load your fine-tuned model and processor
model_path = "./trocr-finetuned-final"
processor = TrOCRProcessor.from_pretrained(model_path)
model = VisionEncoderDecoderModel.from_pretrained(model_path)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [21]:
def predict_text_from_image(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    
    # Process the image
    pixel_values = processor(image, return_tensors="pt").pixel_values
    
    # Generate text
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    return generated_text

In [37]:
# Test on a single image
image_path = r"C:\Users\Eya\Documents\Esprit\PI\bulletin_divise\dossier_5\trocr\dataset\images\Jacoubi chedly.png"
predicted_text = predict_text_from_image(image_path)
print(f"Predicted text: {predicted_text}")

Predicted text: Jacbihed
