In [14]:
import itertools
import logging
from tqdm import tqdm
import torchmetrics as tm
from datamaestro import prepare_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
from typing import List
import time
from icecream import ic

logging.basicConfig(level=logging.INFO)

ds = prepare_dataset("org.universaldependencies.french.gsd")


# Format de sortie décrit dans
# https://pypi.org/project/conllu/


class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement s'il n'est pas connu
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """

    OOVID = 1
    PAD = 0

    def __init__(self, oov: bool):
        """oov : autorise ou non les mots OOV"""
        self.oov = oov
        self.id2word = ["PAD"]
        self.word2id = {"PAD": Vocabulary.PAD}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self, idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self, idx: List[int]):
        return [self.getword(i) for i in idx]


class TaggingDataset:
    def __init__(self, data, words: Vocabulary, tags: Vocabulary, adding=True):
        self.sentences = []

        for s in data:
            self.sentences.append(
                (
                    [words.get(token["form"], adding) for token in s],
                    [tags.get(token["upostag"], adding) for token in s],
                )
            )

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, ix):
        return self.sentences[ix]


def collate_fn(batch):
    """Collate using pad_sequence"""
    return tuple(
        pad_sequence([torch.LongTensor(b[j]) for b in batch]) for j in range(2)
    )


logging.info("Loading datasets...")
words = Vocabulary(True)
tags = Vocabulary(False)
train_data = TaggingDataset(ds.train, words, tags, True)
dev_data = TaggingDataset(ds.validation, words, tags, True)
test_data = TaggingDataset(ds.test, words, tags, False)


logging.info("Vocabulary size: %d", len(words))
logging.info("Tags size: %d", len(tags))
BATCH_SIZE = 100
BATCH_SIZE = 32
LEN_WORDS = len(words)
LEN_TAG = len(tags)
train_loader = DataLoader(
    train_data, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True
)
dev_loader = DataLoader(dev_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)

INFO:root:Loading datasets...
INFO:root:Vocabulary size: 42932
INFO:root:Tags size: 18


In [23]:
def train_epoch(loader, model, loss_fn, optimizer, logger=None, cuda=False, num_classes=18):
    model.train()
    loss_list = []
    acc = tm.classification.Accuracy(task="multiclass", num_classes=num_classes)
    for input, target in loader:
        if cuda:  # only with GPU, and not with CPU
            input = input.cuda()
            target = target.cuda()

        ic(input.size())
        ic(target.size())
        output = model(input)
        ic(output.size())
        loss = loss_fn(output, target)
        loss_list.append(loss.item)
        # backward if we are training
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return np.array(loss_list).mean()


def evaluate(loader, model, loss_fn, cuda=False, num_classes=18):
    model.eval()
    acc = tm.classification.Accuracy(task="multiclass" , num_classes=num_classes)
    for input, target in loader:
        if cuda:  # only with GPU, and not with CPU
            input = input.cuda()
            target = target.cuda()

        # forward
        output = model(input)
        loss = loss_fn(output, target)

class Model(nn.Module):
    def __init__(
        self,
        embedding_dim,
        hidden_size,
        vocab_size,
        tag_size,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size)
        self.f_h = nn.Linear(hidden_size, tag_size)

    def forward(self, x):
        x = self.embedding(x)
        h, (_, _) = self.rnn(x)
        return h

    def decode(h):
        return self.f_h(h)

cuda = torch.cuda.is_available()


lr = 0.001
nb_epoch = 10


model = Model(32, 64, LEN_WORDS, LEN_TAG)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in tqdm(range(nb_epoch)):
    mean_loss, acc_test = train_epoch(train_loader, model, loss_fn, optimizer, cuda=cuda)
    acc_test = evaluate(test_loader, model, loss_fn, cuda=cuda)

  0%|          | 0/10 [00:00<?, ?it/s]ic| input.size(): torch.Size([59, 32])
ic| target.size(): torch.Size([59, 32])
ic| output.size(): torch.Size([59, 32, 64])
  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: Expected target size [59, 64], got [59, 32]

In [8]:
for x,y in train_loader:
    print(x.size())
    print(y.size())
    for i in range(x.size(1)):
        print(words.getwords(x[:, i]))
        break
    for i in range(y.size(1)):
        print(y[:, i])
        break
    break

torch.Size([83, 32])
torch.Size([83, 32])
['Mais', 'le', 'courageux', 'garçon', 'devra', 'revenir', 'avant', 'le', 'crépuscule', '...', 'alors', 'que', "l'", 'orage', 'qui', "s'", 'approche', 'lui', 'mettra', 'des', 'bâtons', 'dans', 'les', 'roues', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
tensor([13,  1,  3,  2,  5,  5,  7,  1,  2, 10,  6,  8,  1,  2,  9,  9,  5,  9,
         5,  1,  2,  7,  1,  2, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,

In [32]:
for x,y in train_loader:
    print(x.size())
    print(y.size())
    print(words.getwords(x))
    break

torch.Size([6, 1])
torch.Size([6, 1])
['Il', 'fut', 'un', 'antisémite', 'enragé', '.']
