# 🖋️ Projet Text to Handwriting – Notebook complet
Ce notebook prépare les données IAM et entraîne un modèle simple Text → Handwriting avec PyTorch.

## 📂 1. Importation des bibliothèques et vérification des dossiers

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim

# Locations of the IAM dataset on disk
data_dir = "IAM_dataset"
lines_file = os.path.join(data_dir, "ascii", "lines.txt")
lines_img_dir = os.path.join(data_dir, "lines")

# Validate the dataset layout up front. Explicit exceptions are raised instead
# of using `assert`, because assert statements are removed when Python runs
# with -O and the checks would then silently disappear.
if not os.path.isdir(data_dir):
    raise FileNotFoundError("Le dossier IAM_dataset est introuvable")
if not os.path.isfile(lines_file):
    raise FileNotFoundError("Le fichier lines.txt est introuvable")
if not os.path.isdir(lines_img_dir):
    raise FileNotFoundError("Le dossier lines est introuvable")
print("✅ IAM dataset prêt")

## 🧾 2. Lecture et parsing du fichier lines.txt

In [None]:
def parse_iam_line(line, img_root):
    """Parse one line of lines.txt into an (image_path, text) pair.

    Args:
        line: raw line read from lines.txt.
        img_root: directory under which the line images are stored.

    Returns:
        A tuple ``(image_path, text)`` when the line has at least 9
        whitespace-separated fields and its second field is ``'ok'``;
        ``None`` for comment lines, blank lines and every other line.
        The existence of ``image_path`` is NOT checked here.
    """
    if line.startswith('#'):
        return None
    fields = line.split()
    # Blank lines yield no fields and are rejected by the length check too.
    if len(fields) < 9 or fields[1] != 'ok':
        return None

    line_id = fields[0]
    # Everything from the 9th field on is the transcription; '|' separates words.
    text = ' '.join(fields[8:]).replace('|', ' ')

    # Images are laid out as <root>/<p0>/<p0>-<p1>/<line_id>.png, where p0 and
    # p1 are the first two dash-separated components of the line id.
    id_parts = line_id.split('-')
    top_dir = id_parts[0]
    sub_dir = '-'.join(id_parts[:2])
    return os.path.join(img_root, top_dir, sub_dir, f"{line_id}.png"), text


# Read the annotation file and keep only entries whose image exists on disk
lines_data = []
with open(lines_file, 'r', encoding='utf-8') as f:
    for line in f:
        parsed = parse_iam_line(line, lines_img_dir)
        if parsed is not None and os.path.exists(parsed[0]):
            lines_data.append(parsed)

df_lines = pd.DataFrame(lines_data, columns=['image_path','text'])
print(f"Nombre de lignes valides : {len(df_lines)}")
print(df_lines.head())

## 🧹 3. Nettoyage et split train/test

In [None]:
# Cleaning: drop rows whose text is blank, then rows whose text has
# 100 characters or more (filters applied one after the other).
df_lines = (
    df_lines
    .loc[lambda frame: frame['text'].str.strip() != '']
    .loc[lambda frame: frame['text'].str.len() < 100]
)

# Hold out 10% of the rows for testing, with a fixed seed for reproducibility
train_df, test_df = train_test_split(
    df_lines,
    test_size=0.1,
    random_state=42,
)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

## 🔄 4. Transformations et Dataset PyTorch

In [None]:
# Image preprocessing pipeline: force a single channel, resize to (32, 128),
# convert to a tensor, then normalize with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(size=(32, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Dataset personnalisé
class HandwritingGenDataset(Dataset):
    """Dataset of (encoded text, line image) pairs.

    Args:
        dataframe: table with 'image_path' and 'text' columns; rows are
            accessed positionally through ``.iloc``.
        char2idx: mapping from a character to its integer index.
        transform: optional callable applied to the loaded grayscale image.
        max_len: exact length of every encoded sequence (shorter texts are
            right-padded, longer ones truncated).
        pad_idx: index used for right-padding. Defaults to 0, which is the
            value that was previously hard-coded.
    """

    def __init__(self, dataframe, char2idx, transform=None, max_len=100, pad_idx=0):
        self.dataframe = dataframe
        self.char2idx = char2idx
        self.transform = transform
        self.max_len = max_len
        self.pad_idx = pad_idx

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def encode_text(self, text):
        """Encode `text` as a LongTensor of exactly ``max_len`` indices.

        Characters absent from ``char2idx`` are skipped. The result is
        truncated to ``max_len`` and right-padded with ``pad_idx``.
        """
        indices = [self.char2idx[c] for c in text if c in self.char2idx]
        indices = indices[:self.max_len]
        indices += [self.pad_idx] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(seq, image)`` for the row at position `idx`."""
        # Fetch the row once instead of once per column.
        row = self.dataframe.iloc[idx]
        seq = self.encode_text(row['text'])

        # The context manager releases the file handle deterministically;
        # convert() returns a separate, fully loaded image that outlives it.
        with Image.open(row['image_path']) as raw:
            image = raw.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        return seq, image

# Vocabulary. Index 0 is reserved for a padding token: the dataset right-pads
# encoded sequences with 0, so no real character may share that index
# (otherwise padding would be indistinguishable from that character).
# PAD_TOKEN is longer than one character, so it can never match a character
# looked up in char2idx.
PAD_TOKEN = '<pad>'
all_text = ''.join(df_lines['text'].values)
vocab = [PAD_TOKEN] + sorted(set(all_text))
vocab_size = len(vocab)  # includes the padding entry
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = {i: c for c, i in char2idx.items()}
print(f"Vocab size: {vocab_size}")

## 📦 5. Créer DataLoaders

In [None]:
max_len = 100

# Both splits share the same character mapping, image transform and sequence length
train_dataset = HandwritingGenDataset(
    train_df,
    char2idx,
    transform=transform,
    max_len=max_len,
)
test_dataset = HandwritingGenDataset(
    test_df,
    char2idx,
    transform=transform,
    max_len=max_len,
)

# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Sanity check: pull a single training batch (if any) and show its shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    seqs, imgs = first_batch
    print(f"Batch seqs shape: {seqs.shape}")
    print(f"Batch images shape: {imgs.shape}")

## 🏗️ 6. Définir le modèle Text → Handwriting simple

In [None]:
class TextToHandwriting(nn.Module):
    """Map a sequence of character indices to a single image.

    The text is embedded, summarized by an LSTM into its final hidden state,
    projected to ``height * width`` values, reshaped into a one-channel map
    and refined by two 3x3 convolutions ending in ``tanh``.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each character embedding.
        hidden_dim: size of the LSTM hidden state.
        img_channels: number of channels of the generated image.
        img_size: ``(height, width)`` of the generated image. Defaults to
            ``(32, 128)``, the size that was previously hard-coded.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, img_channels=1,
                 img_size=(32, 128)):
        super().__init__()
        self.img_size = tuple(img_size)
        height, width = self.img_size
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, height * width)
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, img_channels, 3, padding=1),
            nn.Tanh(),
        )

    def forward(self, x):
        """Generate images from index sequences.

        Args:
            x: integer tensor of character indices; the LSTM is configured
                with ``batch_first=True``.

        Returns:
            Tensor reshaped to ``(-1, img_channels, height, width)`` whose
            values lie in [-1, 1] because of the final ``tanh``.
        """
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        # Select the last layer's final hidden state explicitly rather than
        # squeezing dim 0, which only works when there is exactly one layer.
        flat = self.fc(h_n[-1])
        canvas = flat.view(-1, 1, *self.img_size)
        return self.conv(canvas)

# Use the GPU when PyTorch reports one as available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the model on the chosen device, with an MSE loss and an Adam optimizer
model = TextToHandwriting(vocab_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

## 🔥 7. Boucle d'entraînement simplifiée (test sur 1 batch)

In [None]:
def train_batches(model, loader, criterion, optimizer, device, max_batches=None):
    """Run optimization steps over `loader` and return the per-batch losses.

    Args:
        model: module called on the batch of sequences.
        loader: iterable yielding ``(seqs, imgs)`` batches.
        criterion: loss called as ``criterion(outputs, imgs)``.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to before the forward pass.
        max_batches: stop after this many batches; ``None`` consumes the
            whole loader.

    Returns:
        List of float loss values, one per processed batch.
    """
    losses = []
    if max_batches is not None and max_batches <= 0:
        return losses

    model.train()
    for step, (seqs, imgs) in enumerate(loader, start=1):
        seqs, imgs = seqs.to(device), imgs.to(device)
        optimizer.zero_grad()
        outputs = model(seqs)
        loss = criterion(outputs, imgs)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        print(f"Loss: {losses[-1]:.4f}")
        # Check the limit after the step so no extra batch is loaded
        # from the loader just to be thrown away.
        if max_batches is not None and step >= max_batches:
            break
    return losses


# Smoke test: run a single batch only
train_batches(model, train_loader, criterion, optimizer, device, max_batches=1)

✅ Ce notebook contient toutes les étapes :
1. Préparation des données IAM
2. Parsing et DataFrame
3. Split train/test
4. Dataset PyTorch + transformations
5. DataLoaders
6. Modèle Text → Handwriting
7. Boucle d'entraînement