In [None]:
!pip install torch==2.2.0 torchtext==0.16.2 numpy\<2.0
!pip install portalocker>=2.0.0
!pip install sacrebleu

In [2]:
import torch
import torchtext
import numpy
import portalocker
import torchdata
import math


from torch import nn
from torch import Tensor
from torch.nn import Transformer
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import SentencePieceTokenizer

## Tokenisation (BPE) / Construction du vocabulaire :

In [4]:
# Sanity check: display the pieces produced by the source-language tokenizer for a sample sentence.
# NOTE(review): token_transform and SRC_LANGUAGE are not defined in any cell shown in this
# notebook excerpt — confirm the defining cell exists so Restart & Run All works.
token_transform[SRC_LANGUAGE]("The tokenizer is preprocessing the text.")

['▁The',
 '▁to',
 'ken',
 'iz',
 'er',
 '▁is',
 '▁prep',
 'ro',
 'cess',
 'ing',
 '▁the',
 '▁text',
 '.']

## Création du dataloader

In [5]:
from torch.nn.utils.rnn import pad_sequence

# Helper that chains several transforms into a single callable
def sequential_transforms(*transforms):
    """Compose ``transforms`` left to right.

    Returns a function that feeds its argument to the first transform, passes
    each result on to the next one, and returns the final value. With no
    transforms the returned function is the identity.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func


# Wrap a list of token ids with BOS/EOS and turn it into a tensor
def add_bos_and_eos(token_ids: List[int]):
    """Return a 1-D int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.

    Args:
        token_ids: vocabulary indices of one sentence (may be empty).

    Returns:
        A tensor of length ``len(token_ids) + 2``.
    """
    # dtype is forced to int64: torch.tensor([]) defaults to float32, and torch.cat
    # would then promote the whole sequence to float for an empty sentence.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Full text pipeline per language: raw sentence -> tokens -> vocabulary ids -> tensor with BOS/EOS
text_transform = {}
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    text_transform[ln] = sequential_transforms(
        token_transform[ln],  # tokenization
        vocab_transform[ln],  # numericalization
        add_bos_and_eos,      # add BOS/EOS and build the tensor
    )


# Turn a list of raw (source, target) sentence pairs into padded tensors
def collate_fn(batch):
    """Collate raw sentence pairs into two padded id tensors.

    Each sentence has its trailing newlines stripped and is run through the
    matching ``text_transform`` pipeline; the resulting sequences are then
    padded with ``PAD_IDX`` by ``pad_sequence`` (default layout, i.e. the
    sequence dimension comes first).

    Returns:
        ``(src_batch, tgt_batch)``.
    """
    encoded_pairs = [
        (text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")),
         text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))
        for src_sample, tgt_sample in batch
    ]
    src_sequences = [pair[0] for pair in encoded_pairs]
    tgt_sequences = [pair[1] for pair in encoded_pairs]

    return (pad_sequence(src_sequences, padding_value=PAD_IDX),
            pad_sequence(tgt_sequences, padding_value=PAD_IDX))

Loading

In [6]:
from torch.utils.data import DataLoader
BATCH_SIZE = 128
train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
# Number of training batches. The loader is consumed once to count them; batches are
# counted as they stream by instead of being materialised in a list, so the padded
# tensors of the whole training set are never held in memory at the same time.
# The same DataLoader object can be iterated again afterwards (the training loop does
# so once per epoch), so it does not need to be rebuilt after counting.
size_dataloader = sum(1 for _ in train_dataloader)
val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

## MODELE

Positional encoding sinusoïdal classique :

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings, followed by dropout.

    The table is precomputed once for ``maxlen`` positions and stored as a buffer
    of shape ``(maxlen, 1, emb_size)`` so that it broadcasts over the second
    dimension of the input.

    Args:
        emb_size: embedding width (even or odd).
        dropout: dropout probability applied to the sum.
        maxlen: number of positions available in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair: 10000^(-2i / emb_size).
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one cosine column fewer than sine columns,
        # so the cosine table is trimmed to the number of odd-indexed columns.
        pos_embedding[:, 1::2] = torch.cos(angles)[:, :emb_size // 2]
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Add the encoding of positions ``0 .. token_embedding.size(0) - 1`` and apply dropout.

        Raises:
            ValueError: if the input has more positions than the precomputed table.
        """
        seq_len = token_embedding.size(0)
        max_positions = self.pos_embedding.size(0)
        if seq_len > max_positions:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table size {max_positions}")
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])

Création des masques d'attention

In [8]:
def generate_square_subsequent_mask(mask_size):
    """Build a ``(mask_size, mask_size)`` boolean causal mask on ``DEVICE``.

    Entries strictly above the main diagonal are True (position i must not
    look at positions j > i); the diagonal and everything below it are False.
    """
    all_true = torch.ones((mask_size, mask_size), dtype=torch.bool, device=DEVICE)
    return all_true.triu(diagonal=1)


def create_mask(src, tgt):
    """Build the four masks passed to the transformer for one batch.

    ``src`` and ``tgt`` are indexed with the sequence dimension first
    (``shape[0]`` is used as the sequence length).

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False square mask, ``tgt_mask`` is the causal
        mask, and the padding masks are True where the token equals
        ``PAD_IDX``, transposed so the sequence dimension comes second.
    """
    n_src, n_tgt = src.shape[0], tgt.shape[0]

    # All False: no restriction between source positions.
    src_mask = torch.zeros((n_src, n_src), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(n_tgt).to(DEVICE)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

Modèle

In [9]:
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Embed ``tokens`` (cast to int64 first) and multiply by ``sqrt(emb_size)``."""
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())
        return embedded * scale

In [10]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built around ``torch.nn.Transformer``.

    Source and target ids go through their own ``TokenEmbedding`` and a shared
    ``PositionalEncoding``; the decoder output is projected onto the target
    vocabulary by ``unembed``.

    Args:
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        emb_size: model / embedding width.
        nhead: number of attention heads.
        src_vocab_size: size of the source vocabulary.
        tgt_vocab_size: size of the target vocabulary.
        dim_feedforward: hidden width of the feed-forward sub-layers.
        dropout: dropout probability (transformer and positional encoding).
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in this order on purpose: it fixes the parameter
        # registration order and the way the RNG is consumed during construction.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.unembed = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder and return logits over the target vocabulary."""
        # Embedding + positional encoding (source first, then target)
        encoder_input = self.positional_encoding(self.src_tok_emb(src))
        decoder_input = self.positional_encoding(self.tgt_tok_emb(trg))
        # Full transformer (encoder + decoder)
        hidden = self.transformer(
            encoder_input,
            decoder_input,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        # Projection onto the target vocabulary
        return self.unembed(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack; returns the encoder memory."""
        encoder_input = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(encoder_input, src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack against ``memory`` (no vocabulary projection)."""
        decoder_input = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(decoder_input, memory, tgt_mask)

In [11]:
torch.manual_seed(0)

# Hyperparameters
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 2048
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
lr=0.0003
NUM_EPOCHS=10

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Re-initialise every parameter with more than one dimension (weight matrices,
# embedding tables) with Xavier-uniform; 1-D parameters such as biases keep
# the default initialisation they received at construction.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
transformer = transformer.to(DEVICE)

# Loss and optimizer. Padding positions are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# The learning rate comes from the `lr` hyperparameter above rather than a second
# hard-coded literal, so changing `lr` actually changes the optimizer.
optimizer = torch.optim.Adam(transformer.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)



## Training

In [12]:
def train_epoch(model, optimizer, loss_fn, train_dataloader):
    """Run one optimisation pass over ``train_dataloader``.

    Returns:
        ``(mean_loss, batch_losses)``: the mean of the per-batch losses and the
        list of per-batch losses in iteration order.
    """
    model.train()
    batch_losses = []
    running_loss = 0.0

    for src_batch, tgt_batch in train_dataloader:
        src_batch = src_batch.to(DEVICE)
        tgt_batch = tgt_batch.to(DEVICE)

        # Teacher forcing shift along the sequence dimension:
        decoder_input = tgt_batch[:-1, :]  # last time step dropped: what the decoder sees
        expected = tgt_batch[1:, :]        # first time step (BOS) dropped: what it must predict

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_batch, decoder_input)

        # Reset gradients, then forward / loss / backward / update
        optimizer.zero_grad()
        logits = model(src_batch, decoder_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)
        vocab_dim = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, vocab_dim), expected.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_losses.append(batch_loss)
        running_loss += batch_loss

    return running_loss / len(batch_losses), batch_losses


def evaluate(model, loss_fn, val_dataloader):
    """Compute the mean per-batch loss over ``val_dataloader`` without gradient tracking."""
    model.eval()
    batch_losses = []
    running_loss = 0.0

    with torch.no_grad():
        for src_batch, tgt_batch in val_dataloader:
            src_batch = src_batch.to(DEVICE)
            tgt_batch = tgt_batch.to(DEVICE)

            # Same shift as in training
            decoder_input = tgt_batch[:-1, :]  # last time step dropped: what the decoder sees
            expected = tgt_batch[1:, :]        # first time step (BOS) dropped: what it must predict

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_batch, decoder_input)
            logits = model(src_batch, decoder_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)

            vocab_dim = logits.shape[-1]
            batch_loss = loss_fn(logits.reshape(-1, vocab_dim), expected.reshape(-1)).item()
            batch_losses.append(batch_loss)
            running_loss += batch_loss

    return running_loss / len(batch_losses)

In [13]:
# Train for NUM_EPOCHS epochs, reporting the training and validation loss after each one.
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss, _ = train_epoch(model=transformer,
                                optimizer=optimizer,
                                loss_fn=loss_fn,
                                train_dataloader=train_dataloader)
    val_loss = evaluate(model=transformer, loss_fn=loss_fn, val_dataloader=val_dataloader)
    print(f"Epoch {epoch} | Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")



Epoch 1 | Train loss: 5.0683 | Val loss: 4.3053
Epoch 2 | Train loss: 3.9511 | Val loss: 3.7779
Epoch 3 | Train loss: 3.4936 | Val loss: 3.4729
Epoch 4 | Train loss: 3.0142 | Val loss: 2.9634
Epoch 5 | Train loss: 2.4868 | Val loss: 2.5166
Epoch 6 | Train loss: 2.0671 | Val loss: 2.2886
Epoch 7 | Train loss: 1.7717 | Val loss: 2.1497
Epoch 8 | Train loss: 1.5397 | Val loss: 2.0597
Epoch 9 | Train loss: 1.3470 | Val loss: 2.0368
Epoch 10 | Train loss: 1.1930 | Val loss: 2.0381


## INFERENCE

BEAM SEARCH pour la traduction

In [14]:
import torch.nn.functional as F
def beam_search_decode(model, src, src_mask, max_len, start_symbol, K=5,length_penalty=0.7):
    """Decode a single source sentence with beam search and return the best hypothesis.

    Hypotheses are ranked by ``log_prob_sum / len(sequence) ** length_penalty`` so that
    short sequences are not systematically preferred.

    Args:
        model: object exposing ``encode(src, src_mask)``, ``decode(tgt, memory, tgt_mask)``
            and ``unembed(hidden)``.
        src: source token ids of shape ``(src_len, 1)`` (exactly one sentence).
        src_mask: attention mask forwarded to ``model.encode``.
        max_len: maximum length of the produced sequence, start symbol included.
        start_symbol: id used as the first target token.
        K: beam width (number of hypotheses kept after each step).
        length_penalty: exponent applied to the sequence length in the ranking score.

    Returns:
        Tensor of shape ``(out_len, 1)`` holding the best sequence, start symbol included
        (and ``EOS_IDX`` as last token when the hypothesis terminated).

    Raises:
        ValueError: if ``K < 1`` or ``src`` does not contain exactly one sentence.
    """
    if K < 1:
        raise ValueError(f"beam width K must be >= 1, got {K}")
    # The decoding below reads logits[-1, 0, :] and appends (1, 1) tensors, so it can
    # only handle a batch of one; fail early with a clear message otherwise.
    if src.dim() != 2 or src.shape[1] != 1:
        raise ValueError(
            f"beam_search_decode expects src of shape (src_len, 1), got {tuple(src.shape)}")

    def normalised_score(hypothesis):
        # Length-normalised log-probability used both for pruning and for the final choice.
        seq, score, _ = hypothesis
        return score / (seq.size(0) ** length_penalty)

    model.eval()
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    with torch.no_grad():
        memory = model.encode(src, src_mask)
        init_seq = torch.full((1, src.shape[1]), start_symbol, dtype=torch.long, device=DEVICE)
        # Each hypothesis is (sequence, summed log-probability, finished flag).
        beams = [(init_seq, 0.0, False)]

        for _ in range(max_len - 1):
            # Stop as soon as every hypothesis in the beam has emitted EOS.
            if all(finished for (_, _, finished) in beams):
                break

            candidates = []
            for seq, score, finished in beams:
                if finished:
                    # Finished hypotheses are carried over unchanged.
                    candidates.append((seq, score, True))
                    continue

                # Otherwise score the next token given the sequence so far.
                tgt_mask = generate_square_subsequent_mask(seq.size(0)).to(DEVICE)
                out = model.decode(seq, memory, tgt_mask)
                logits = model.unembed(out)
                log_probs = F.log_softmax(logits[-1, 0, :], dim=-1)

                # Expand with the best next tokens; the width is clamped because
                # torch.topk raises when k exceeds the vocabulary size.
                k = min(K, log_probs.size(-1))
                topk_log_probs, topk_tokens = torch.topk(log_probs, k)
                # One host transfer per tensor instead of one .item() call per candidate.
                for tok, tok_log_prob in zip(topk_tokens.tolist(), topk_log_probs.tolist()):
                    new_seq = torch.cat([seq, torch.tensor([[tok]], dtype=torch.long, device=DEVICE)], dim=0)
                    # A hypothesis is finished once it emits EOS.
                    candidates.append((new_seq, score + tok_log_prob, tok == EOS_IDX))

            # Keep only the K best of the (at most K * K) candidates.
            candidates.sort(key=normalised_score, reverse=True)
            beams = candidates[:K]

        # Select the best hypothesis left in the beam.
        best_seq, _, _ = max(beams, key=normalised_score)
        return best_seq

TRADUCTEUR : Anglais -> Allemand


In [15]:
def detokenize_sentencepiece(tokens):
    """Join SentencePiece pieces back into text.

    The pieces are concatenated, every "▁" word-boundary marker becomes a
    space, and leading/trailing whitespace is removed.
    """
    joined = "".join(tokens)
    spaced = joined.replace("▁", " ")
    return spaced.strip()


def Beam_translate(model: torch.nn.Module, src_sentence: str):
    """Translate one source sentence with beam search and return plain text.

    The sentence is encoded with the source ``text_transform``, decoded with
    ``beam_search_decode`` (beam width 5, length penalty 0.7, at most
    ``num_tokens + 5`` target tokens), mapped back to target-vocabulary
    tokens, detokenized, and stripped of the ``<bos>`` / ``<eos>`` markers.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    # All False: no restriction between source positions.
    src_mask = torch.zeros(num_tokens, num_tokens, dtype=torch.bool)

    best_hypothesis = beam_search_decode(
        model,
        src,
        src_mask,
        max_len=num_tokens + 5,
        start_symbol=BOS_IDX,
        K=5,
        length_penalty=0.7,
    )
    token_ids = best_hypothesis.flatten().cpu().tolist()

    pieces = vocab_transform[TGT_LANGUAGE].lookup_tokens(token_ids)
    text = detokenize_sentencepiece(pieces)
    for marker in ("<bos>", "<eos>"):
        text = text.replace(marker, "")
    return text.strip()


BLEU score :

In [26]:
import sacrebleu
from datasets import load_dataset

# Corpus-level BLEU on the test split of the Hugging Face copy of Multi30k:
# each English sentence is translated with beam search and compared with its
# German reference.
dataset = load_dataset("bentrevett/multi30k")
test_data = dataset["test"]

transformer.eval()

predictions, references = [], []
with torch.no_grad():
    for example in test_data:
        references.append(example["de"])
        predictions.append(Beam_translate(transformer, example["en"]))

# Single reference stream, standard "13a" tokenization.
bleu = sacrebleu.corpus_bleu(predictions, [references], tokenize="13a")

print("BLEU score:", bleu.score)


README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

BLEU score: 33.00658512896013


Idéalement, j’aurais dû évaluer sur exactement la version TorchText utilisée pour l’entraînement afin de garantir une parfaite cohérence. Cependant, le split test de TorchText posait un problème de chargement et nécessitait de refaire tout le pipeline. J’ai donc utilisé la version HuggingFace, très proche en contenu, et le score BLEU obtenu reste cohérent avec les benchmarks attendus.

In [27]:
# Qualitative check: translate a few hand-written English sentences, one per line.
sample_sentences = [
    "The woman is sleeping",
    "The cat is playing with the dog.",
    "Two children are playing in the snow.",
    "A man is riding a bicycle down the street.",
    "Several people are sitting at a table outside.",
    "A dog is running through a field.",
    "A woman is holding a baby in her arms.",
    "Three men are standing near a car.",
    "A child is eating ice cream.",
    "I'm hungry, I want to eat",
]
for sample_sentence in sample_sentences:
    print(Beam_translate(transformer, sample_sentence))

Eine Frau schläft auf dem Bett.
Die Katze spielt mit dem Hund.
Zwei Kinder spielen im Schnee.
Ein Mann fährt auf einem Fahrrad die Straße entlang.
Mehrere Personen sitzen draußen an einem Tisch.
Ein Hund läuft über ein Feld.
Eine Frau hält ein Baby in ihren Armen.
Drei Männer stehen in der Nähe eines Autos.
Ein Kind isst Eiscreme.
Ich vergnügen sich, um zu essen.


A woman is sleeping on the bed. (mots en plus)

A cat is playing with the dog.

Two children are playing in the snow.

A man is riding a bicycle down the street.

Several people are sitting outside at a table.

A dog is running across a field.

A woman is holding a baby in her arms.

Three men are standing near a car.

A child is eating an ice cream.

They are having fun to eat. (complètement faux dès que l'on sort du registre « description d'image »)

Conclusion :

Dans l’ensemble, le modèle produit des traductions grammaticalement correctes et sémantiquement cohérentes pour des phrases simples de description. On observe toutefois une forte homogénéité dans les structures et le vocabulaire, qui reflète directement le biais du jeu de données utilisé (phrases descriptives, scènes neutres, langage standardisé, issu de descriptions d’images).

Ce biais limite la diversité lexicale et la capacité de généralisation du modèle à des phrases plus complexes, abstraites ou hors domaine. Les résultats sont donc satisfaisants dans le cadre restreint du dataset, mais ils ne garantissent pas encore des performances robustes en conditions réelles ou sur des textes plus riches et variés.