- Nom : Andriamaharo
- Prénoms : Kwon-Chui Rado Angelin
- Examen : RNN
- Projet : Traduction de Texte Français-Anglais
- Niveau : M1-I2AD

À lancer sur Google Colab

## Step 1 : Installation de PyTorch

In [None]:
!pip install torch==2.0.0 torchtext==0.15.1

## Step 2 : Charger le dataset

In [None]:
import unicodedata
import string

# Parallel corpora filled while the dataset file is read: the English and
# French sentences stored at the same index form one translation pair.
train_english_sentences, train_french_sentences = [], []

# Normalize special Unicode characters and trim surrounding whitespace
def clean_text(text):
    """Return *text* NFKC-normalized and stripped of surrounding whitespace.

    Any narrow no-break space (U+202F) still present after normalization is
    replaced by a regular space before stripping.
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = normalized.replace("\u202f", " ")
    return normalized.strip()

# Strip punctuation from sentence boundaries (applied to the training data)
def remove_punctuation(sentence):
    """Strip ASCII punctuation, then whitespace, from both ends of *sentence*.

    Punctuation located inside the sentence is left untouched.
    """
    without_edge_marks = sentence.strip(string.punctuation)
    return without_edge_marks.strip()

# Read the tab-separated corpus: the first column is the English sentence,
# the second column its French translation.
with open("/content/fra.txt", "r", encoding="utf-8") as corpus:
    for raw_line in corpus:
        columns = raw_line.strip().split("\t")
        if len(columns) < 2:
            # Ignore lines that do not provide both columns.
            continue

        # Basic Unicode cleanup of each side.
        english = clean_text(columns[0])
        french = clean_text(columns[1])

        # Training data is stored without leading/trailing punctuation.
        train_english_sentences.append(remove_punctuation(english))
        train_french_sentences.append(remove_punctuation(french))

# Preview the first 10 sentences of each language as a sanity check.
print("Training English Sentences:", train_english_sentences[:10])
print("Training French Sentences:", train_french_sentences[:10])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import random

# Tokenization
def tokenize(sentence):
    """Split *sentence* into a list of tokens on runs of whitespace."""
    tokens = sentence.split()
    return tokens

# Build a vocabulary
def build_vocab(sentences):
    """Map every whitespace-separated word of *sentences* to an integer id.

    Ids 0, 1 and 2 are reserved for "<pad>", "<start>" and "<end>". Corpus
    words receive consecutive ids starting at 3, assigned in sorted order so
    that the mapping is identical on every run (iterating a set of strings
    directly depends on per-process hash randomization).

    Args:
        sentences: iterable of sentence strings.

    Returns:
        dict mapping each token to a unique id in ``range(len(vocab))``.
    """
    vocab = {"<pad>": 0, "<start>": 1, "<end>": 2}

    words = set()
    for sentence in sentences:
        words.update(sentence.split())
    # A corpus word spelled like a special token must not consume an id,
    # otherwise the id range would have a gap and exceed len(vocab) - 1.
    words.difference_update(vocab)

    for index, word in enumerate(sorted(words), start=len(vocab)):
        vocab[word] = index
    return vocab

# Vocabularies are built from at most the first 50,000 sentence pairs.
french_vocab = build_vocab(train_french_sentences[:50000])
english_vocab = build_vocab(train_english_sentences[:50000])
# Reverse lookup (id -> word) used to turn decoder output back into text.
inv_english_vocab = dict(zip(english_vocab.values(), english_vocab.keys()))

# Sentence encoding
def encode_sentence(sentence, vocab):
    """Convert *sentence* to a list of ids wrapped in "<start>"/"<end>".

    Tokens that are absent from *vocab* are encoded with the "<pad>" id.
    """
    wrapped = ["<start>", *tokenize(sentence), "<end>"]
    pad_id = vocab["<pad>"]
    return [vocab.get(token, pad_id) for token in wrapped]

# Encode the same first 50,000 pairs that were used for the vocabularies.
encoded_french = [
    encode_sentence(sentence, french_vocab)
    for sentence in train_french_sentences[:50000]
]
encoded_english = [
    encode_sentence(sentence, english_vocab)
    for sentence in train_english_sentences[:50000]
]

# Custom dataset
class TranslationDataset(Dataset):
    """Pairs of encoded source/target sentences served as tensors."""

    def __init__(self, source_data, target_data):
        """Keep references to the two parallel lists of id sequences."""
        self.source_data, self.target_data = source_data, target_data

    def __len__(self):
        # The source side defines the number of examples.
        return len(self.source_data)

    def __getitem__(self, idx):
        """Return the (source, target) pair at *idx* as two tensors."""
        source_ids = self.source_data[idx]
        target_ids = self.target_data[idx]
        return torch.tensor(source_ids), torch.tensor(target_ids)

# French ids are the source side, English ids the target side.
dataset = TranslationDataset(
    source_data=encoded_french,
    target_data=encoded_english,
)

# Collate function that pads variable-length sequences
def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch tensors.

    Both outputs are batch-first and padded with 0 up to the longest
    sequence of their respective side.
    """
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(sources, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return padded_sources, padded_targets

# DataLoader: shuffled mini-batches of 2 pairs, padded by collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

# Seq2Seq model built from an LSTM encoder and an LSTM decoder
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder producing target-vocabulary logits per position."""

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, hidden_size):
        """Create the embeddings, the two LSTMs and the output projection.

        Args:
            input_vocab_size: number of distinct source token ids.
            output_vocab_size: number of distinct target token ids.
            embedding_dim: size of both embedding tables.
            hidden_size: hidden state size shared by encoder and decoder.
        """
        super().__init__()
        self.embedding_enc = nn.Embedding(input_vocab_size, embedding_dim)
        self.embedding_dec = nn.Embedding(output_vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode *tgt* step by step from the encoding of *src*.

        Args:
            src: batch-first tensor of source token ids.
            tgt: batch-first tensor of target token ids; ``tgt[:, 0]`` is fed
                as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next decoder input.

        Returns:
            Tensor of shape (batch, tgt_len, output_vocab_size) holding the
            logits of every step; position 0 is never written and stays 0.
        """
        # Encoder: only the final hidden/cell states are passed on.
        embedded_src = self.embedding_enc(src)
        _, (hidden, cell) = self.encoder(embedded_src)

        # Decoder
        batch_size = tgt.size(0)
        seq_len = tgt.size(1)
        # Size the buffer from the model's own output layer rather than from a
        # module-level vocabulary, so the class works for any vocabulary size.
        outputs = torch.zeros(
            batch_size, seq_len, self.fc.out_features, device=src.device
        )

        input_token = tgt[:, 0].unsqueeze(1)  # first decoder token (<start>)
        for t in range(1, seq_len):
            embedded_dec = self.embedding_dec(input_token)
            output, (hidden, cell) = self.decoder(embedded_dec, (hidden, cell))
            logits = self.fc(output)

            outputs[:, t, :] = logits.squeeze(1)

            # One draw per step: the whole batch shares the decision.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = logits.argmax(2)
            input_token = tgt[:, t].unsqueeze(1) if teacher_force else top1

        return outputs

# Model initialization: layer sizes follow the two vocabularies.
input_vocab_size = len(french_vocab)
output_vocab_size = len(english_vocab)
embedding_dim, hidden_size = 16, 32
model = Seq2Seq(
    input_vocab_size=input_vocab_size,
    output_vocab_size=output_vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
)

# Optimizer and loss function; targets equal to 0 (the <pad> id) are ignored
# by the loss.
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Training: number of epochs
epochs = 50

def train():
    """Train the module-level model on `dataloader` for `epochs` epochs.

    The teacher forcing ratio decreases linearly with the epoch index,
    starting at 0.5 and never dropping below 0.1. Every 10th epoch the loss
    summed over all batches of that epoch is printed.
    """
    model.train()
    for epoch in range(epochs):
        teacher_forcing_ratio = max(0.5 - (epoch / epochs) * 0.5, 0.1)
        epoch_loss = 0
        for src, tgt in dataloader:
            optimizer.zero_grad()
            logits = model(src, tgt, teacher_forcing_ratio)

            # Drop position 0 (<start>) on both sides before computing the loss.
            flat_logits = logits[:, 1:, :].reshape(-1, output_vocab_size)
            flat_targets = tgt[:, 1:].reshape(-1)

            batch_loss = criterion(flat_logits, flat_targets)
            batch_loss.backward()
            optimizer.step()

            epoch_loss += batch_loss.item()

        if epoch % 10 != 0:
            continue
        print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}, Teacher Forcing Ratio: {teacher_forcing_ratio:.2f}")

# Translation
def translate(sentence, max_length=20):
    """Greedily translate a French *sentence* into English with `model`.

    The sentence first goes through the same `clean_text` and
    `remove_punctuation` steps as the training data, so that inference sees
    tokens in the form the vocabulary was built from.

    Args:
        sentence: raw French text.
        max_length: maximum number of decoding steps (defaults to 20).

    Returns:
        The generated English words joined by single spaces. Decoding stops
        at the first "<end>" token, which is not part of the result.
    """
    model.eval()
    # Create every tensor on the device that holds the model's weights.
    device = next(model.parameters()).device
    sentence = remove_punctuation(clean_text(sentence))

    with torch.no_grad():
        src = torch.tensor(
            encode_sentence(sentence, french_vocab), device=device
        ).unsqueeze(0)
        embedded_src = model.embedding_enc(src)
        _, (hidden, cell) = model.encoder(embedded_src)

        end_id = english_vocab["<end>"]
        generated = []
        input_token = torch.tensor([[english_vocab["<start>"]]], device=device)
        for _ in range(max_length):
            embedded_dec = model.embedding_dec(input_token)
            output, (hidden, cell) = model.decoder(embedded_dec, (hidden, cell))
            logits = model.fc(output)
            top1 = logits.argmax(2).item()
            if top1 == end_id:
                break
            generated.append(top1)
            # Feed the prediction back as the next decoder input.
            input_token = torch.tensor([[top1]], device=device)

    return " ".join(inv_english_vocab[idx] for idx in generated)

## Step 3 : Entraînement

In [None]:
# Run the training loop defined in the previous cell.
train()

## Step 4 : Faire le test

In [None]:
# Try the translation on a sample French sentence
value="bonjour, je suis libre"
print("la traduction de ", value, "est de :", translate(value))