In [1]:
!pip install transformers torchvision datasets --quiet
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import AdamW
from torchvision.models import resnet18
from torch import nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from torchvision import transforms
from torchmetrics.text import CharErrorRate, WordErrorRate
from tqdm import tqdm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━

2025-05-12 10:24:28.726767: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747045468.912809      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747045468.965934      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from torchvision.models import resnet18
from torchmetrics.text import CharErrorRate, WordErrorRate
from PIL import Image
from sklearn.model_selection import train_test_split

# Dataset Class
class HandwritingDataset(Dataset):
    def __init__(self, images_dir, labels_dir, tokenizer, transform=None):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.tokenizer = tokenizer
        self.transform = transform or transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.ToTensor(),
        ])

        self.image_files = sorted([f for f in os.listdir(images_dir) 
                                   if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
        self.label_files = sorted([f for f in os.listdir(labels_dir) 
                                   if f.lower().endswith('.txt')])

        assert len(self.image_files) == len(self.label_files), "Image/label count mismatch"
        for img, lbl in zip(self.image_files, self.label_files):
            assert os.path.splitext(img)[0] == os.path.splitext(lbl)[0], \
                f"Mismatched pair: {img} vs {lbl}"

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.images_dir, self.image_files[idx])
        image = Image.open(img_path).convert('RGB')

        lbl_path = os.path.join(self.labels_dir, self.label_files[idx])
        with open(lbl_path, 'r', encoding='windows-1256') as f:
            text = f.read().strip()

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            max_length=128,
            truncation=True
        )

        return {
            'pixel_values': image,
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'raw_text': text
        }

# Model with Frozen ResNet
class HandwritingGPT2(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = resnet18(pretrained=True)
        for param in self.cnn.parameters():
            param.requires_grad = False
        self.cnn.fc = nn.Linear(512, 768)
        self.gpt2 = GPT2LMHeadModel.from_pretrained("aubmindlab/aragpt2-base")

    def forward(self, pixel_values, input_ids=None, attention_mask=None, labels=None):
        features = self.cnn(pixel_values)
        if input_ids is not None:
            features = features.unsqueeze(1).expand(-1, input_ids.shape[1], -1)
        return self.gpt2(
            inputs_embeds=features,
            attention_mask=attention_mask,
            labels=labels
        )

# Initialize
tokenizer = GPT2Tokenizer.from_pretrained("aubmindlab/aragpt2-base")
tokenizer.pad_token = tokenizer.eos_token

dataset = HandwritingDataset(
    images_dir="/kaggle/input/khatt-arabic-hand-written-lines/images",
    labels_dir="/kaggle/input/khatt-arabic-hand-written-lines/labels",
    tokenizer=tokenizer
)

train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HandwritingGPT2().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
cer = CharErrorRate().to(device)
wer = WordErrorRate().to(device)

# Training Loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    train_cer, train_wer = [], []

    for batch in train_loader:
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values, input_ids, attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        with torch.no_grad():
            features = model.cnn(pixel_values).unsqueeze(1)
            generated = model.gpt2.generate(
                inputs_embeds=features,
                max_length=128,
                num_beams=5,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True) for g in generated]
            train_cer.append(cer(preds, batch['raw_text']))
            train_wer.append(wer(preds, batch['raw_text']))

    print(f"\n--- Epoch {epoch+1} Training ---")
    print(f"Avg Loss     : {total_loss / len(train_loader):.4f}")
    print(f"Training CER : {torch.stack(train_cer).mean().item():.4f}")
    print(f"Training WER : {torch.stack(train_wer).mean().item():.4f}")

    model.eval()
    val_loss, val_cer, val_wer = 0, [], []
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(pixel_values, input_ids, attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()
            features = model.cnn(pixel_values).unsqueeze(1)
            generated = model.gpt2.generate(
                inputs_embeds=features,
                max_length=128,
                num_beams=5,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True) for g in generated]
            val_cer.append(cer(preds, batch['raw_text']))
            val_wer.append(wer(preds, batch['raw_text']))

    print(f"--- Epoch {epoch+1} Validation ---")
    print(f"Validation Loss : {val_loss / len(val_loader):.4f}")
    print(f"Validation CER  : {torch.stack(val_cer).mean().item():.4f}")
    print(f"Validation WER  : {torch.stack(val_wer).mean().item():.4f}")

vocab.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 190MB/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Epoch 1 Training ---
Avg Loss     : 1.2727
Training CER : 0.9510
Training WER : 0.9902
--- Epoch 1 Validation ---
Validation Loss : 4.7213
Validation CER  : 1.9752
Validation WER  : 1.4356

--- Epoch 2 Training ---
Avg Loss     : 0.9728
Training CER : 0.8729
Training WER : 0.9766
--- Epoch 2 Validation ---
Validation Loss : 3.4821
Validation CER  : 0.8455
Validation WER  : 1.0519

--- Epoch 3 Training ---
Avg Loss     : 0.9470
Training CER : 0.8502
Training WER : 0.9730
--- Epoch 3 Validation ---
Validation Loss : 3.0956
Validation CER  : 0.9739
Validation WER  : 1.4716

--- Epoch 4 Training ---
Avg Loss     : 0.9255
Training CER : 0.8183
Training WER : 0.9677
--- Epoch 4 Validation ---
Validation Loss : 3.2038
Validation CER  : 0.8163
Validation WER  : 1.1827

--- Epoch 5 Training ---
Avg Loss     : 0.9222
Training CER : 0.8247
Training WER : 0.9692
--- Epoch 5 Validation ---
Validation Loss : 1.4377
Validation CER  : 0.9148
Validation WER  : 1.2832

--- Epoch 6 Training ---
Avg 

In [3]:
# Sauvegarder uniquement les poids du modèle
torch.save(model.state_dict(), "handwriting_gpt2_resnet.pt")
#from transformers import GPT2Tokenizer
#tokenizer = GPT2Tokenizer.from_pretrained("aragpt2-base")


In [4]:
# Prediction Function
def predict(image_path, model, tokenizer):
    transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    
    with torch.no_grad():
        features = model.cnn(image).unsqueeze(1)
        generated = model.gpt2.generate(
            inputs_embeds=features,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Test Prediction
test_img = "/kaggle/input/khatt-arabic-hand-written-lines/images/AHTD3A0001_Para1_4.jpg"
print("Predicted Text:", predict(test_img, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Predicted Text:  أفهمض مثل بغ�ض الضابط الضابط لز له الضابطمتك سألت سألت راجح راجح راجح بلغ بلغ بلغ بلغ بلغ بلغ
