In [4]:
import logging
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torch.nn as nn
import torchmetrics as tm
import torch.optim as optim
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import torch
from icecream import ic
import unicodedata
import string
from tqdm import tqdm
from pathlib import Path
from typing import List

import time
import re
from torch.utils.tensorboard import SummaryWriter


logging.basicConfig(level=logging.INFO)

FILE = "../data/en-fra.txt"

writer = SummaryWriter("/tmp/runs/tag-" + time.asctime())


def normalize(s):
    return re.sub(
        " +",
        " ",
        "".join(
            c if c in string.ascii_letters else " "
            for c in unicodedata.normalize("NFD", s.lower().strip())
            if c in string.ascii_letters + " " + string.punctuation
        ),
    ).strip()


class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """

    PAD = 0
    EOS = 1
    SOS = 2
    OOVID = 3

    def __init__(self, oov: bool):
        self.oov = oov
        self.id2word = ["PAD", "EOS", "SOS"]
        self.word2id = {
            "PAD": Vocabulary.PAD,
            "EOS": Vocabulary.EOS,
            "SOS": Vocabulary.SOS,
        }
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self, idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self, idx: List[int]):
        return [self.getword(i) for i in idx]


class TradDataset:
    def __init__(self, data, vocOrig, vocDest, adding=True, max_len=10):
        self.sentences = []
        for s in tqdm(data.split("\n")):
            if len(s) < 1:
                continue
            orig, dest = map(normalize, s.split("\t")[:2])
            if len(orig) > max_len:
                continue
            self.sentences.append(
                (
                    torch.tensor(
                        [vocOrig.get(o) for o in orig.split(" ")] + [Vocabulary.EOS]
                    ),
                    torch.tensor(
                        [vocDest.get(o) for o in dest.split(" ")] + [Vocabulary.EOS]
                    ),
                )
            )

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, i):
        return self.sentences[i]


def collate_fn(batch):
    orig, dest = zip(*batch)
    o_len = torch.tensor([len(o) for o in orig])
    d_len = torch.tensor([len(d) for d in dest])
    return pad_sequence(orig), o_len, pad_sequence(dest), d_len


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open(FILE) as f:
    lines = f.readlines()

lines = [lines[x] for x in torch.randperm(len(lines))]
idxTrain = int(0.8 * len(lines))

vocEng = Vocabulary(True)
vocFra = Vocabulary(True)
MAX_LEN = 100
BATCH_SIZE = 32

datatrain = TradDataset("".join(lines[:idxTrain]), vocEng, vocFra, max_len=MAX_LEN)
datatest = TradDataset("".join(lines[idxTrain:]), vocEng, vocFra, max_len=MAX_LEN)

train_loader = DataLoader(
    datatrain, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True
)
test_loader = DataLoader(
    datatest, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=False
)

#  TODO:  Implémenter l'encodeur, le décodeur et la boucle d'apprentissage

100%|██████████| 136521/136521 [00:05<00:00, 24558.34it/s]
100%|██████████| 34132/34132 [00:01<00:00, 26735.15it/s]


In [2]:
for x_1, y_1, x_2, y_2 in train_loader:
    print(x_1[:, 0])
    print(vocEng.getwords(x_1[:, 0]))
    print(y_1)
    print(vocFra.getwords(x_2[:, 0]))
    break

tensor([ 15,  99, 241,  75, 255, 227, 777,   9, 163, 932, 378, 207, 108,   1])
['it', 's', 'been', 'a', 'long', 'time', 'since', 'i', 've', 'done', 'anything', 'like', 'that', 'EOS']
tensor([14,  9, 14,  8,  4,  4,  5,  8,  7,  9,  5,  7,  6,  9, 10,  6,  5,  5,
         9, 10,  9,  4,  8,  6,  8, 10, 13,  6,  7,  5,  5,  7])
['ca', 'fait', 'longtemps', 'que', 'je', 'n', 'ai', 'rien', 'fait', 'de', 'tel', 'EOS', 'PAD']


In [8]:
class Encoder(nn.Module):
    def __init__(self, input_vocab_size, hidden_size, embedding_dim):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size)

    def forward(self, input, input_lengths):
        embedded = self.embedding(input)
        packed = pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        h, _ = self.gru(packed)
        return h


class Decoder(nn.Module):
    def __init__(
        self, output_vocab_size, hidden_size, embedding_dim, max_length=MAX_LEN
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.max_length = max_length
        self.embedding = nn.Embedding(output_vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size)
        self.to_vocab = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, input, hidden):
        """
        Input est soit
        * Mode contraint : Les vrais mots de la phrase
        * Mode non contraint : Le mot précédent prédit par le décodeur
        """
        output = self.embedding(input) # .view(1, 1, -1)
        ic(output.size())
        # output = F.relu(output)
        output, _ = self.gru(output, hidden)
        # output = self.softmax(self.to_vocab(output[0]))
        output = self.to_vocab(output)
        return output

    def generate(self, hidden, lens_seq=None, teacher_forcing=False):
        inputs = torch.tensor([Vocabulary.SOS]).to(device)
        outputs = []

        for i in range(self.max_length if lens_seq is None else lens_seq):
            output = self.forward(inputs, hidden)
            topv, topi = output.topk(1)
            if topi.item() == Vocabulary.EOS:
                break
            else:
                outputs.append(topi.item())
                inputs = topi.squeeze().detach()

        return outputs


def run_epoch(
    loader,
    encoder,
    decoder,
    loss_fn,
    optimizer=None,
    logger=None,
    device="cuda",
    num_classes=18,
):
    loss_list = []
    acc = tm.classification.Accuracy(task="multiclass", num_classes=num_classes)
    acc.to(device)
    encoder.to(device)
    decoder.to(device)
    if optimizer:
        encoder.train()
        decoder.train()
    else:
        encoder.eval()
        decoder.eval()

    if optimizer:
        encoder_optimizer, decoder_optimizer = optimizer

    for x, len_x, y, len_y in loader:
        coin_flip = int(torch.rand(1)) + 1  # stay on teacher forcing mode yet
        x = x.to(device)
        y = y.to(device)

        # Encoder part
        h_encoder = encoder(x, len_x)
        # Decoder part
        if coin_flip:  # teacher forcing mode
            h_decoder = decoder(y, h_encoder)
        else:
            ...
            decoder()

        loss = loss_fn(h_decoder, y)
        loss_list.append(loss.item())
        acc(h_decoder.argmax(1), y)

        # backward if we are training
        if optimizer:
            optimizer[0].zero_grad()
            optimizer[1].zero_grad()
            loss.backward()
            optimizer[0].step()
            optimizer[1].step()
    return np.array(loss_list).mean(), acc.compute().item()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lr = 0.001
lr_encoder = lr
lr_decoder = lr
nb_epoch = 10

len_voc_origin = len(vocEng)
len_voc_dest = len(vocFra)
loss_fn = nn.CrossEntropyLoss()

encoder = Encoder(len_voc_origin, 32, 64)
decoder = Decoder(len_voc_dest, 32, 64)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr_encoder)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr_decoder)
# optimizer = torch.optim.Adam(encoder.parameters(), lr=lr)
# optimizer.add_param_group(decoder.parameters())

for epoch in tqdm(range(nb_epoch)):
    mean_train_loss, acc_train = run_epoch(
        train_loader,
        encoder,
        decoder,
        loss_fn,
        optimizer=(encoder_optimizer, decoder_optimizer),
        device=device,
    )
    mean_test_loss, acc_test = run_epoch(
        test_loader, encoder, decoder, loss_fn, device=device
    )
    ic(mean_train_loss)
    ic(acc_train)
    ic(mean_test_loss)
    ic(acc_test)

  0%|          | 0/10 [00:00<?, ?it/s]

ic| output.size(): torch.Size([15, 32, 64])
  0%|          | 0/10 [00:00<?, ?it/s]


AttributeError: 'PackedSequence' object has no attribute 'dim'