In [2]:
import torch
import torch.nn as nn

import math, time
import unicodedata, re

# Integer ids reserved for the special vocabulary entries.
PADDING_token = 0
START_token = 1
END_token = 2
SOS_token = 3

# Text form of each special vocabulary entry.
PADDING = "[PADDING]"
START = "[START]"
END = "[END]"
SOS = "[SOS]"

class CONFIG_class:
    """Container for the run settings and Transformer hyperparameters.

    All values are plain instance attributes, so they can be overwritten after
    construction. Both vocabulary sizes are initialised to 0 here (presumably
    filled in once the vocabularies are known -- confirm against the caller).
    """

    def __init__(self):
        # Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
        backend = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(backend)

        # Batching and sequence-length limits.
        self.batch_size = 64
        self.max_seq_length = 14

        # Vocabulary sizes (0 at construction time).
        self.src_vocab_size = 0
        self.tgt_vocab_size = 0

        # Transformer hyperparameters.
        self.d_model = 256             # width of every token embedding
        self.num_heads = 8             # attention heads per layer
        self.num_layers_encoder = 6    # stacked encoder layers
        self.num_layers_decoder = 6    # stacked decoder layers
        self.d_feedforward = 1024      # hidden width of the feed-forward sublayer
        self.dropout = 0.1             # dropout probability


class Lang:
    """Vocabulary of one language: word -> index and index -> word tables.

    The four special tokens are registered up front, so ordinary words are
    numbered from 4 onwards in the order they are first seen.
    """

    def __init__(self, name):
        self.name = name
        # Forward table, seeded with the special tokens.
        self.word2index = {
            PADDING: PADDING_token,
            START: START_token,
            END: END_token,
            SOS: SOS_token,
        }
        # Reverse table, seeded with the same special tokens.
        self.index2word = {
            SOS_token: SOS,
            START_token: START,
            END_token: END,
            PADDING_token: PADDING,
        }
        # Index that the next new word will receive.
        self.n_words = 4

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Give ``word`` the next free index unless it is already registered."""
        if word in self.word2index:
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.n_words = new_index + 1

# Helper Module that adds positional encoding to the token embedding
# to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    Args:
        config: object exposing ``d_model``, ``max_seq_length``, ``batch_size``
            and ``dropout``.
    """

    def __init__(self, config):
        super(PositionalEncoding, self).__init__()
        # Compute the positional encodings once in log space.
        den = torch.exp(-torch.arange(0, config.d_model, 2) * (math.log(10000) / config.d_model))
        pos = torch.arange(config.max_seq_length).unsqueeze(1)  # shape: (max_seq_length, 1)
        pos_embedding = torch.zeros(config.max_seq_length, config.d_model)
        pos_embedding[:, 0::2] = torch.sin(pos * den)  # even feature columns
        pos_embedding[:, 1::2] = torch.cos(pos * den)  # odd feature columns

        # Every batch row holds the same table. The (batch_size, max_seq_length, d_model)
        # layout is kept unchanged so that state dicts saved with this buffer shape still load.
        pos_embedding = pos_embedding.unsqueeze(0).repeat(config.batch_size, 1, 1)

        # Store the positional embedding in a buffer (a tensor that is not a parameter)
        self.register_buffer('pos_embedding', pos_embedding)

        # Define dropout layer to be applied to the embeddings
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, token_embedding):
        """Return ``dropout(token_embedding + positional table)``.

        Args:
            token_embedding: tensor indexed as (batch, seq_len, d_model).

        Note:
            A ``seq_len`` larger than ``config.max_seq_length`` still fails with a
            shape-mismatch error, because the table has no rows for those positions.
        """
        seq_len = token_embedding.size(1)
        # Broadcast a single copy of the table over the batch axis. Slicing the batch
        # axis to the input's batch size (as before) broke for any batch larger than
        # config.batch_size, although all rows of the buffer are identical.
        return self.dropout(token_embedding + self.pos_embedding[:1, :seq_len])


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(d_model)``."""

    def __init__(self, vocab_size: int, d_model: int):
        super().__init__()

        # Lookup table holding one d_model-sized vector per vocabulary index.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, tokens):
        """Look up ``tokens`` (cast to int64) and scale the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(tokens.long())
        return vectors * scale


# Seq2Seq Network using Transformer architecture
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings and a vocabulary projection.

    Wraps ``nn.Transformer`` configured with GELU activation, ``norm_first=False``
    and ``batch_first=True``. Source and target sequences each have their own
    token embedding but share one positional-encoding module.
    """

    def __init__(self, config):
        super().__init__()

        # Core encoder/decoder stack, created on config.device.
        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers_encoder,
            num_decoder_layers=config.num_layers_decoder,
            dim_feedforward=config.d_feedforward,
            dropout=config.dropout,
            activation="gelu",
            norm_first=False,
            batch_first=True,
            device=config.device,
        )

        # Maps decoder states to one score per target-vocabulary entry.
        self.generator = nn.Linear(config.d_model, config.tgt_vocab_size)

        # Separate embedding tables for the source and target vocabularies.
        self.src_token_emb = TokenEmbedding(config.src_vocab_size, config.d_model)
        self.tgt_token_emb = TokenEmbedding(config.tgt_vocab_size, config.d_model)

        # Position information added to both embedded sequences.
        self.positional_encoding = PositionalEncoding(config)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """Run the full encoder-decoder pass and return target-vocabulary scores.

        No memory mask is applied (``memory_mask=None``).
        """
        src_emb = self.positional_encoding(self.src_token_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_token_emb(trg))
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src, src_mask):
        """Embed ``src`` and run only the encoder stack; returns the encoder memory."""
        embedded = self.positional_encoding(self.src_token_emb(src))
        return self.transformer.encoder(embedded, mask=src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Embed ``tgt`` and run only the decoder stack against ``memory``."""
        embedded = self.positional_encoding(self.tgt_token_emb(tgt))
        return self.transformer.decoder(embedded, memory, tgt_mask=tgt_mask)

class Translator:
    """Bundle a trained model with its two vocabularies and its config.

    Provides checkpoint save/load and greedy decoding of a single sentence.
    """

    def __init__(self, model, input_lang, output_lang, config):
        self.model = model
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.config = config

    def save(self, path):
        """Write the model weights, both vocabularies and the config to ``path``."""
        torch.save({
            'model_state_dict': self.model.state_dict(),  # Save only the model weights
            'input_lang': self.input_lang,
            'output_lang': self.output_lang,
            'config': self.config
        }, path)

    @classmethod
    def load(cls, path):
        """Rebuild a ``Translator`` from a checkpoint written by ``save``.

        The checkpoint is deserialised onto the CPU first, so a file written on a
        GPU machine can also be restored where CUDA is unavailable; in that case
        ``config.device`` is switched to the CPU.
        """
        # SECURITY: weights_only=False unpickles arbitrary Python objects (needed here for
        # the vocabulary and config instances). Only load checkpoints from a trusted source.
        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
        input_lang = checkpoint['input_lang']
        output_lang = checkpoint['output_lang']
        config = checkpoint['config']

        # The stored device reflects the machine that saved the checkpoint; fall back to
        # the CPU when it asks for CUDA and none is available here.
        if torch.device(config.device).type == "cuda" and not torch.cuda.is_available():
            config.device = torch.device("cpu")

        model = Seq2SeqTransformer(config)
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(config.device)
        return cls(model, input_lang, output_lang, config)

    def evaluate(self, src_sentence):
        """Greedily translate ``src_sentence`` and return the decoded tokens joined by spaces.

        The returned string includes the start marker and, when it was generated
        within ``config.max_seq_length`` steps, the end marker.

        Raises:
            KeyError: if a normalised source word is missing from the input vocabulary.
        """
        self.model.eval()
        device = self.config.device

        src = tensorFromSentence(self.input_lang, normalizeString(src_sentence)).to(device)
        num_tokens = src.shape[1]
        # All-False mask: no source position is hidden from any other.
        src_mask = torch.zeros(num_tokens, num_tokens, dtype=torch.bool, device=device)

        # Inference only: skip autograd bookkeeping for the whole decoding loop.
        with torch.no_grad():
            memory = self.model.encode(src, src_mask)
            ys = torch.ones(1, 1, dtype=torch.long, device=device).fill_(START_token)

            for step in range(self.config.max_seq_length):
                # Upper-triangular mask so each position attends only to itself and earlier ones.
                tgt_mask = torch.triu(torch.ones(ys.size(1), ys.size(1), dtype=torch.bool, device=device), diagonal=1)
                out = self.model.decode(ys, memory, tgt_mask)
                prob = self.model.generator(out[:, -1])
                _, next_word = torch.max(prob, dim=1)
                next_word = next_word.item()
                ys = torch.cat([ys, torch.ones(1, 1, dtype=torch.long, device=device).fill_(next_word)], dim=1)
                if next_word == END_token:
                    break

        tgt_tokens = ys.flatten().tolist()
        return " ".join(self.output_lang.index2word[token] for token in tgt_tokens)
        

# Lowercase, trim, strip accents, and keep only ASCII letters plus "!" and "?"
def normalizeString(s):
    """Normalise ``s`` and wrap it in the start and end markers.

    Steps: lowercase and trim; drop combining marks after NFD decomposition;
    put a space before ".", "!" and "?"; replace every run of characters other
    than ASCII letters, "!" and "?" with a single space.

    Returns:
        The marker-wrapped text with tokens separated by single spaces. Text that
        normalises to nothing yields just the two markers separated by one space.
    """
    s = s.lower().strip()
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    # Join only the non-empty parts: with an empty body, plain concatenation produced two
    # consecutive spaces, which split(' ') turns into an empty-string token downstream.
    return " ".join(part for part in (START, s.strip(), END) if part)
    
def indexesFromSentence(lang, sentence):
    """Return the ``lang.word2index`` entry of every single-space-separated token.

    Raises:
        KeyError: if a token is not present in ``lang.word2index``.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as an int64 tensor of shape (1, number_of_tokens).

    The tensor is created on CUDA when available, otherwise on the CPU.
    """
    token_ids = indexesFromSentence(lang, sentence)
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    target = torch.device(backend)
    encoded = torch.tensor(token_ids, dtype=torch.long, device=target)
    return encoded.view(1, -1)

In [44]:
# Restore a saved translator from its checkpoint (the file name suggests English -> French).
loaded_translator1 = Translator.load('translator_model_eng_fra.pth')

In [45]:
# Translate one sample sentence with the first translator and print the decoded tokens.
src_sentence = "Hi"
translation = loaded_translator1.evaluate(src_sentence)
print("Translation:", translation)

Translation: [START] salut ! [END]


In [48]:
# Restore a second saved translator (the file name suggests French -> English).
loaded_translator2 = Translator.load('translator_model_fra_eng2.pth')

In [51]:
# Translate one sample sentence with the second translator and print the decoded tokens.
src_sentence = "j'aime"
translation = loaded_translator2.evaluate(src_sentence)
print("Translation:", translation)

Translation: [START] i like it [END]


In [4]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download the data required by nltk
# NOTE(review): the inputs below are already token lists, so 'punkt' may not be needed
# for this cell -- confirm before removing, other cells may rely on it.
nltk.download('punkt')

# Reference translation (the correct human translation)
reference = [['this', 'is', 'a', 'small', 'test']]

# Machine-generated translation
candidate = ['this', 'is' ,'test']

# Compute the BLEU score with a smoothing method
smoothie = SmoothingFunction().method4
score = sentence_bleu(reference, candidate, smoothing_function=smoothie)

print(f"Score BLEU : {score:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emett\AppData\Roaming\nltk_data...


Score BLEU : 0.1203


[nltk_data]   Package punkt is already up-to-date!
