# RNN & Attention: HW

Привет! Это твоё домашнее задание: сделать модель, которая может переводить тексты с немецкого языка на английский. Для обучения будет использоваться датасет [wmt-14](https://huggingface.co/datasets/wmt14). Для проверки будет использоваться BLEU на тестовой выборке и 10 примеров перевода вашей модели. В этом ноутбуке есть скелет для обучения модели трансформера. Но вы можете пользоваться и RNN, если вы считаете, что можете обучить её под эту задачу. Главное -- получить `submission.yaml`, используя нейросети.

**!Внимание!** В этой домашней работе нельзя пользоваться библиотекой `transformers`.

In [None]:
import subprocess
import sys
import math


# True when the notebook runs inside Google Colab (its module is pre-loaded there).
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Invoke pip/nltk through the interpreter that runs this kernel, pass the
    # arguments as a list (no shell), and fail loudly if an install step breaks.
    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "datasets",
            "nltk",
            "gensim",
            "einops",
            "evaluate",
        ],
        check=True,
    )
    subprocess.run([sys.executable, "-m", "nltk.downloader", "punkt"], check=True)

In [None]:
import torch
import nltk
import einops
import evaluate
import pyarrow
from datasets import load_dataset, arrow_dataset
from tqdm import trange, tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Seed the PyTorch RNG; the trailing semicolon hides the notebook cell output.
torch.manual_seed(42);

In [None]:
# BLEU metric object; used further down to score the test-set translations.
bleu = evaluate.load("bleu")

# Данные

В этой части подготовьте данные для обучения. Не забудьте добавить "BOS", "EOS" и "UNK" токены в ваши словари.

In [None]:
# Load the German-English ("de-en") configuration of the WMT14 dataset.
wmt14 = load_dataset("wmt14", "de-en")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Fetch the WordNet data that the lemmatizer below relies on.
nltk.download("wordnet")

tokenizer = nltk.WordPunctTokenizer()
lemmatizer = nltk.WordNetLemmatizer()


def tokenize_pipeline(sentence):
    """Tokenize `sentence`, drop non-alphabetic tokens and lemmatize the rest.

    Returns the list of lemmatized tokens in their original order.
    """
    lemmas = []
    for token in tokenizer.tokenize(sentence):
        if not token.isalpha():
            # Punctuation, numbers and mixed tokens are discarded.
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Draw a 20k-pair training subset. The shuffle is seeded so that the same
# subset (and therefore the same vocabulary) is produced on every run.
dataset_table = pyarrow.Table.from_pydict(wmt14["train"].shuffle(seed=42)[:20000])
wmt14_train = arrow_dataset.Dataset(dataset_table)


def _tokenize_corpus(language):
    """Tokenize one language side of the train subset, validation and test splits.

    The sentences are returned in that split order.
    """
    splits = (wmt14_train, wmt14["validation"], wmt14["test"])
    return [
        tokenize_pipeline(pair[language])
        for split in splits
        for pair in split["translation"]
    ]


def _build_vocabulary(words):
    """Map special tokens and `words` to ids.

    `[BOS]` is 0, `[EOS]` is 1, words follow in sorted order (so ids do not
    depend on set iteration order) and `[UNK]` takes the last id.
    """
    vocabulary = {"[BOS]": 0, "[EOS]": 1}
    for idx, word in enumerate(sorted(words)):
        vocabulary[word] = idx + 2
    vocabulary["[UNK]"] = len(vocabulary)
    return vocabulary


tokenized_en = _tokenize_corpus("en")
tokenized_de = _tokenize_corpus("de")

# NOTE: the vocabularies deliberately cover validation and test words too,
# which is why lookups on those splits never miss.
all_tokenized_en_words = {word for words in tokenized_en for word in words}
all_tokenized_de_words = {word for words in tokenized_de for word in words}

en_words_to_ids = _build_vocabulary(all_tokenized_en_words)
de_words_to_ids = _build_vocabulary(all_tokenized_de_words)
en_ids_to_words = {idx: word for word, idx in en_words_to_ids.items()}
tokenized_en[0], tokenized_de[0]

(['The',
  'Bank',
  'of',
  'France',
  'wa',
  'indeed',
  'threatened',
  'by',
  'some',
  'revolutionary',
  'battalion',
  'but',
  'the',
  'Communards',
  'never',
  'took',
  'over',
  'the',
  'Bank'],
 ['Ein',
  'Grund',
  'warum',
  'sie',
  'nicht',
  'so',
  'gut',
  'funktionierte',
  'war',
  'auch',
  'das',
  'diese',
  'Frage',
  'sehr',
  'umstritten',
  'war',
  'Ich',
  'schrieb',
  'einmal',
  'einen',
  'Artikel',
  'für',
  'ein',
  'populäres',
  'Magazin',
  'mit',
  'dem',
  'Titel',
  'Le',
  'piège',
  'coopératif',
  'Die',
  'kooperative',
  'Falle'])

In [None]:
len(all_tokenized_en_words), len(
    all_tokenized_de_words
)  # the German vocabulary is larger, presumably because of compound words

(37673, 67915)

In [None]:
class TranslationDataset(torch.utils.data.Dataset):
    """Pairs of fixed-length (German ids, English ids) tensors.

    Every sequence has the form `[BOS] tokens... [EOS] [EOS]...` and is
    exactly `max_len` long; `[EOS]` doubles as the padding token.

    Args:
        tokenizer: callable mapping a sentence string to a list of tokens.
        de_words_to_ids: German token -> id mapping.
        en_words_to_ids: English token -> id mapping.
        dataset: object with a `map` method whose examples expose
            `example["translation"]["de"]` and `example["translation"]["en"]`.
        max_len: length of every returned sequence.
    """

    BOS_ID = 0
    EOS_ID = 1

    def __init__(
        self, tokenizer, de_words_to_ids, en_words_to_ids, dataset, max_len=32
    ):
        self.tokenizer = tokenizer
        self.de_words_to_ids = de_words_to_ids
        self.en_words_to_ids = en_words_to_ids
        self.max_len = max_len

        def encode(example):
            # One pass produces all four columns the original four passes did.
            pair = example["translation"]
            tokens_de = self.tokenizer(pair["de"])
            tokens_en = self.tokenizer(pair["en"])
            return {
                "tokens_de": tokens_de,
                "tokens_en": tokens_en,
                "ids_de": self._to_ids(tokens_de, self.de_words_to_ids),
                "ids_en": self._to_ids(tokens_en, self.en_words_to_ids),
            }

        self.dataset = dataset.map(encode)

    @staticmethod
    def _to_ids(tokens, words_to_ids):
        """Convert tokens to ids, using `[UNK]` for unknown words when available.

        Raises:
            KeyError: a token is unknown and the mapping has no `[UNK]` entry.
        """
        unk_id = words_to_ids.get("[UNK]")
        if unk_id is None:
            return [words_to_ids[token] for token in tokens]
        return [words_to_ids.get(token, unk_id) for token in tokens]

    def _to_padded_tensor(self, ids):
        """Wrap `ids` in BOS/EOS and pad with EOS up to `max_len`."""
        # Reserve two slots so that even a truncated sentence ends with EOS.
        body = ids[: max(self.max_len - 2, 0)]
        sequence = [self.BOS_ID] + body + [self.EOS_ID]
        sequence += [self.EOS_ID] * (self.max_len - len(sequence))
        return torch.tensor(sequence[: self.max_len])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the `(german_ids, english_ids)` tensors of example `index`."""
        example = self.dataset[index]
        return (
            self._to_padded_tensor(example["ids_de"]),
            self._to_padded_tensor(example["ids_en"]),
        )

In [None]:
# Wrap each split in a TranslationDataset; all three share the tokenizer
# and the two vocabularies and differ only in the underlying data.
train_dataset = TranslationDataset(
    tokenizer=tokenize_pipeline,
    de_words_to_ids=de_words_to_ids,
    en_words_to_ids=en_words_to_ids,
    dataset=wmt14_train,
)
valid_dataset = TranslationDataset(
    tokenizer=tokenize_pipeline,
    de_words_to_ids=de_words_to_ids,
    en_words_to_ids=en_words_to_ids,
    dataset=wmt14["validation"],
)
test_dataset = TranslationDataset(
    tokenizer=tokenize_pipeline,
    de_words_to_ids=de_words_to_ids,
    en_words_to_ids=en_words_to_ids,
    dataset=wmt14["test"],
)
# Show one encoded (German, English) pair.
train_dataset[1]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

(tensor([    0, 56242, 49339, 66999, 17489, 45050, 23389, 11726, 17508, 25813,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]),
 tensor([    0, 31091, 12219, 35468, 22693,  1835,  9058, 34270, 26962, 20119,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]))

In [None]:
# Reshuffle the training examples on every epoch; the evaluation loaders keep
# the dataset order so their outputs stay aligned with the references.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True
)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=128)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=128)

# Model

Сделайте модель, которая умеет переводить. Для этой модели потребуется сделать `Encoder` и `Decoder`. Первый будет брать текст на немецком и отдавать информацию про него. Decoder будет брать информацию про немецкий текст и превращать её в английский текст.

Для слоев Encoder можете скопировать код из семинара:

In [None]:
class MLP(torch.nn.Module):
    """Two-layer feed-forward block with a residual connection.

    The input is expanded to four times `hidden_dim`, passed through ReLU and
    projected back, then added to the original input.
    """

    def __init__(self, hidden_dim: int):
        super().__init__()
        expanded_dim = 4 * hidden_dim
        self.linear_0 = torch.nn.Linear(hidden_dim, expanded_dim)
        self.linear_1 = torch.nn.Linear(expanded_dim, hidden_dim)

    def forward(self, hidden_state):
        """Return `hidden_state` plus its feed-forward transformation."""
        expanded = self.linear_0(hidden_state)
        activated = torch.relu(expanded)
        projected = self.linear_1(activated)
        return projected + hidden_state


def attention(K, V, Q, num_head):
    """Multi-head scaled dot-product attention.

    Args:
        K: keys, shape (batch, key_len, hidden_dim).
        V: values, shape (batch, key_len, hidden_dim).
        Q: queries, shape (batch, query_len, hidden_dim).
        num_head: number of heads; `hidden_dim` must be divisible by it.

    Returns:
        Tensor of shape (batch, query_len, hidden_dim) in which every query
        position holds the attention-weighted average of the values.
    """
    batch_size, query_len, hidden_dim = Q.size()
    # Keys/values may come from a different sequence than the queries.
    key_len = K.size(1)
    K = K.reshape(batch_size, key_len, -1, num_head)
    Q = Q.reshape(batch_size, query_len, -1, num_head)
    V = V.reshape(batch_size, key_len, -1, num_head)
    # scores[b, s, t, l]: similarity of key s and query t in head l.
    scores = torch.einsum("bscl,btcl->bstl", K, Q) / math.sqrt(hidden_dim // num_head)
    weights = torch.softmax(scores, dim=1)  # normalise over the key positions
    # Sum the values over the KEY index s; indexing V by the query index
    # instead would cancel the weights and return V unchanged.
    result_headed = torch.einsum("bstl,bscl->btcl", weights, V)
    return result_headed.reshape(batch_size, query_len, hidden_dim)

Для Decoder слоя потребуется модифицировать код. Не забудьте, что для декодера требуется другой механизм внимания.

In [None]:
class AttentionModule(torch.nn.Module):
    """Self-attention sub-layer: Q/K/V projections, attention, output projection.

    The result of the output projection is added to the input (residual).
    """

    def __init__(self, hidden_dim: int, num_heads: int):
        super().__init__()
        # Created in the order q, k, v, out.
        for name in ("q_linear", "k_linear", "v_linear", "out_linear"):
            setattr(self, name, torch.nn.Linear(hidden_dim, hidden_dim))
        self.num_heads = num_heads

    def forward(self, hidden_state):
        """Attend from `hidden_state` to itself and add the residual."""
        queries = self.q_linear(hidden_state)
        keys = self.k_linear(hidden_state)
        values = self.v_linear(hidden_state)
        mixed = attention(keys, values, queries, self.num_heads)
        projected = self.out_linear(mixed)
        return projected + hidden_state


class DecoderAttentionModule(torch.nn.Module):
    """Cross-attention sub-layer of the decoder.

    The decoder states ask the questions (queries); the encoder states
    provide the keys and values. The residual connection is taken around
    the decoder states, so the output stays in the decoder stream.
    """

    def __init__(self, hidden_dim: int, num_heads: int):
        super().__init__()
        self.q_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.out_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.num_heads = num_heads

    def forward(self, hidden_state, decoder_hidden_state):
        """Let the decoder states attend to the encoder states.

        Args:
            hidden_state: encoder output (the attended-to sequence).
            decoder_hidden_state: current decoder states (the attending sequence).

        Returns:
            Updated decoder states, same shape as `decoder_hidden_state`.
        """
        Q = self.q_linear(decoder_hidden_state)
        K = self.k_linear(hidden_state)
        V = self.v_linear(hidden_state)
        attention_output = attention(K, V, Q, self.num_heads)
        return self.out_linear(attention_output) + decoder_hidden_state

In [None]:
class EncoderTransformerLayer(torch.nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block."""

    def __init__(self, hidden_dim: int, num_heads):
        super().__init__()
        self.attention = AttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)

    def forward(self, inputs):
        """Run `inputs` through the attention sub-layer, then the MLP."""
        return self.mlp(self.attention(inputs))

In [None]:
class Encoder(torch.nn.Module):
    """Embeds German token ids and runs them through the encoder layer.

    Args:
        de_dictionary_size: number of rows of the word-embedding table; must
            exceed the largest token id.
        hidden_dim: embedding / hidden size.
        num_heads: number of attention heads.
        max_seq_len: number of learned position embeddings.
    """

    NUM_PASSES = 3  # how many times the (weight-shared) layer is applied

    def __init__(
        self, de_dictionary_size: int, hidden_dim: int, num_heads, max_seq_len=64
    ):
        super().__init__()
        self.word_embedding = torch.nn.Embedding(de_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.transformer = EncoderTransformerLayer(hidden_dim, num_heads)
        self.to(DEVICE)

    def forward(self, inputs):
        """Encode a (batch, seq_len) tensor of ids into (batch, seq_len, hidden_dim)."""
        # Build the position ids on the same device as the inputs so that the
        # module keeps working wherever its inputs live.
        positions = torch.arange(inputs.size(-1), device=inputs.device)
        out = self.word_embedding(inputs) + self.pos_embedding(positions)
        # The same layer instance is applied repeatedly, i.e. weights are shared.
        for _ in range(self.NUM_PASSES):
            out = self.transformer(out)
        return out

In [None]:
# Smoke test. The embedding table must cover every id in the vocabulary,
# including the special tokens, hence len(de_words_to_ids) rather than the
# number of distinct words.
encoder = Encoder(len(de_words_to_ids), 8, num_heads=8)
encoder(train_dataset[0][0].unsqueeze(0).to(DEVICE)).shape

torch.Size([1, 32, 8])

In [None]:
class DecoderTransformerLayer(torch.nn.Module):
    """One decoder layer: causal self-attention, cross-attention, feed-forward."""

    def __init__(self, hidden_dim: int, num_heads):
        super().__init__()
        self.self_attention = AttentionModule(hidden_dim, num_heads)
        self.out_attention = DecoderAttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)

    def _causal_self_attention(self, inputs):
        """Self-attention in which a position only sees itself and earlier positions.

        Reuses the projection layers of `self.self_attention` but applies a
        look-ahead mask, which a decoder needs so that it cannot read the
        tokens it is supposed to predict.
        """
        projections = self.self_attention
        batch_size, seq_len, hidden_dim = inputs.size()
        num_heads = projections.num_heads
        head_dim = hidden_dim // num_heads

        def split_heads(tensor):
            # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
            return tensor.reshape(batch_size, seq_len, num_heads, head_dim).transpose(
                1, 2
            )

        queries = split_heads(projections.q_linear(inputs))
        keys = split_heads(projections.k_linear(inputs))
        values = split_heads(projections.v_linear(inputs))
        scores = queries @ keys.transpose(-2, -1) / math.sqrt(head_dim)
        # True above the diagonal marks "future" key positions.
        future = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=inputs.device),
            diagonal=1,
        )
        scores = scores.masked_fill(future, float("-inf"))
        mixed = torch.softmax(scores, dim=-1) @ values
        mixed = mixed.transpose(1, 2).reshape(batch_size, seq_len, hidden_dim)
        return projections.out_linear(mixed) + inputs

    def forward(self, inputs, encoder_layer_output):
        """Update decoder states `inputs` using the encoder output."""
        self_attention = self._causal_self_attention(inputs)
        out_attention = self.out_attention(encoder_layer_output, self_attention)
        return self.mlp(out_attention)

In [None]:
class Decoder(torch.nn.Module):
    """Embeds English token ids and runs them through the decoder layer.

    Args:
        en_dictionary_size: number of rows of the word-embedding table; must
            exceed the largest token id.
        hidden_dim: embedding / hidden size.
        num_heads: number of attention heads.
        max_seq_len: number of learned position embeddings.
    """

    NUM_PASSES = 3  # how many times the (weight-shared) layer is applied

    def __init__(
        self, en_dictionary_size: int, hidden_dim: int, num_heads, max_seq_len=64
    ):
        super().__init__()
        self.word_embedding = torch.nn.Embedding(en_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.transformer = DecoderTransformerLayer(hidden_dim, num_heads)
        # Not used by forward(); kept so the module's parameters are unchanged.
        self.lm_head = torch.nn.Linear(hidden_dim, en_dictionary_size)
        self.to(DEVICE)

    def forward(self, inputs, encoder_output):
        """Return hidden states of shape (batch, seq_len, hidden_dim).

        Args:
            inputs: (batch, seq_len) tensor of English token ids.
            encoder_output: encoder states passed to every decoder pass.
        """
        # Build the position ids on the same device as the inputs.
        positions = torch.arange(inputs.size(-1), device=inputs.device)
        out = self.word_embedding(inputs) + self.pos_embedding(positions)
        # The same layer instance is applied repeatedly, i.e. weights are shared.
        for _ in range(self.NUM_PASSES):
            out = self.transformer(out, encoder_output)
        return out

In [None]:
# Smoke test. The embedding table must cover every id in the vocabulary,
# including the special tokens, hence len(en_words_to_ids).
decoder = Decoder(len(en_words_to_ids), 8, 8)
decoder(
    train_dataset[0][1].unsqueeze(0).to(DEVICE), torch.rand(1, 32, 8).to(DEVICE)
).shape

torch.Size([1, 32, 8])

In [None]:
class TranslationModel(torch.nn.Module):
    """Encoder-decoder translation model producing per-position vocabulary scores.

    Args:
        de_dictionary_size: size of the German vocabulary.
        en_dictionary_size: size of the English vocabulary.
        hidden_dim: hidden size shared by the encoder and decoder.
        en_ids_to_words: English id -> word mapping used by `translate`.
        num_heads: number of attention heads.
    """

    def __init__(
        self,
        de_dictionary_size: int,
        en_dictionary_size: int,
        hidden_dim: int,
        en_ids_to_words,
        num_heads=8,
    ):
        super().__init__()
        self.encoder = Encoder(de_dictionary_size, hidden_dim, num_heads)
        self.decoder = Decoder(en_dictionary_size, hidden_dim, num_heads)
        self.pred = torch.nn.Linear(hidden_dim, en_dictionary_size)
        self.en_ids_to_words = en_ids_to_words
        self.to(DEVICE)

    def forward(self, inputs):
        """Score every English word at every decoder position.

        Args:
            inputs: pair `(original_ids, translation_ids)` of (batch, seq_len)
                tensors; both are cast to long and moved to DEVICE.

        Returns:
            Unnormalised scores (logits) of shape (batch, vocab, seq_len), the
            layout `torch.nn.CrossEntropyLoss` expects. No softmax is applied:
            the loss applies log-softmax itself, and argmax is unaffected.
        """
        original_ids, translation_ids = inputs
        original_ids = original_ids.long().to(DEVICE)
        translation_ids = translation_ids.long().to(DEVICE)
        encoder_output = self.encoder(original_ids)
        decoder_output = self.decoder(translation_ids, encoder_output)
        logits = self.pred(decoder_output)
        return logits.permute(0, 2, 1)

    def translate(self, inputs):
        """Return, per batch element, the highest-scoring word at every position.

        The words are looked up in the mapping given to the constructor and
        joined with spaces.
        """
        with torch.no_grad():
            best_ids = self(inputs).argmax(dim=1)  # (batch, seq_len)
        return [
            " ".join(self.en_ids_to_words[token_id] for token_id in row)
            for row in best_ids.tolist()
        ]

In [None]:
# Smoke test: build a small model (hidden size 8) and push one
# (German, English) pair through it as a batch of one.
model = TranslationModel(len(de_words_to_ids), len(en_words_to_ids), 8, en_ids_to_words)
model((train_dataset[0][0].unsqueeze(0), train_dataset[0][1].unsqueeze(0))).shape

torch.Size([1, 37675, 32])

In [None]:
# Smoke test for translate(). forward() casts its inputs with .long(), so
# these torch.rand values in [0, 1) all become token id 0.
model.translate((torch.rand(2, 16), torch.rand(2, 16)))

['Cox Cox scratching distributes Cox Jasna accept Receive Niebler individualistic Werbefotograf individualistic Cox Certop Cox Oplonti',
 'Cox Cox scratching distributes Cox Jasna accept Receive Niebler individualistic Werbefotograf individualistic Cox Certop Cox Oplonti']

Сделайте модель, оптимизатор и лосс-функцию. В нашем случае лосс-функция будет проверять предсказания токенов на каждой позиции -- по сути, это классификатор на каждую позицию.

In [None]:
# The model that is actually trained: hidden size 32.
model = TranslationModel(
    len(de_words_to_ids), len(en_words_to_ids), 32, en_ids_to_words
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): CrossEntropyLoss applies log-softmax itself, so it should be
# fed unnormalised scores of shape (batch, classes, positions).
criterion = torch.nn.CrossEntropyLoss()

In [None]:
model.train()
for epoch in range(1):
    total_loss, num_batches = 0.0, 0
    for batch in train_dataloader:
        source_ids, target_ids = batch
        target_ids = target_ids.to(DEVICE)
        # Teacher forcing: the output at position t must predict the token at
        # position t + 1. Shift the targets left by one and fill the freed
        # last slot with EOS (id 1), keeping the sequence length unchanged.
        eos_column = torch.ones_like(target_ids[:, :1])
        labels = torch.cat([target_ids[:, 1:], eos_column], dim=1)

        optimizer.zero_grad()
        result = model((source_ids, target_ids))
        loss = criterion(result, labels)
        # Gradients are required here, so no torch.no_grad() around the update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than the last batch only.
    print(f"Epoch: {epoch}, loss: {total_loss / max(num_batches, 1)}")

Epoch: 0, loss: 10.246766090393066


In [None]:
# Inverse German vocabulary (id -> word), used to print an encoded example.
de_ids_to_words = {idx: word for word, idx in de_words_to_ids.items()}
" ".join(de_ids_to_words[token_id] for token_id in train_dataset[1][0].tolist())

'[BOS] In den USA ist individuelle Unabhängigkeit von überragender Bedeutung [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS] [EOS]'

Чтобы получить перевод, надо сделать функцию для декодинга. Она будет брать предсказания токена на последней позиции и отдавать нужный токен.

In [None]:
def get_last_token_prediction(prefix, original, model):
    """Return the id the model scores highest at the last position of `prefix`.

    Args:
        prefix: (batch, seq_len) tensor with the translation generated so far.
        original: (batch, seq_len) tensor with the source sentence ids.
        model: callable taking `(original, prefix)` and returning scores of
            shape (batch, vocab, seq_len).

    Returns:
        0-dim tensor with the best-scoring id for the first batch element.
    """
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        output = model((original, prefix))
    last_position_scores = output[0, :, -1]
    return last_position_scores.argmax()

In [None]:
original = "Guten Morgen!"
MAX_LEN, BOS_ID, EOS_ID = 32, 0, 1

# Encode the source once. Unknown words become [UNK] when the vocabulary has
# such an entry and are skipped otherwise.
unk_id = de_words_to_ids.get("[UNK]")
source_ids = [
    de_words_to_ids.get(token, unk_id) for token in tokenize_pipeline(original)
]
source_ids = [token_id for token_id in source_ids if token_id is not None]
source_ids = [BOS_ID] + source_ids[: MAX_LEN - 2]
source_ids += [EOS_ID] * (MAX_LEN - len(source_ids))
tokenized_original = torch.tensor(source_ids).unsqueeze(0)

# Greedy decoding on token ids: the score column at the last generated
# position predicts the next token. The loop is bounded by MAX_LEN, so it
# terminates even if the model never emits [EOS].
generated_ids = [BOS_ID]
with torch.no_grad():
    while len(generated_ids) < MAX_LEN:
        padded_prefix = generated_ids + [EOS_ID] * (MAX_LEN - len(generated_ids))
        tokenized_prefix = torch.tensor(padded_prefix).unsqueeze(0)
        scores = model((tokenized_original, tokenized_prefix))
        last_token_id = scores[0, :, len(generated_ids) - 1].argmax().item()
        if last_token_id == EOS_ID:
            break
        generated_ids.append(last_token_id)

# Join with spaces so the words stay separate; [BOS]/[EOS] are not printed.
prefix = " ".join(en_ids_to_words[token_id] for token_id in generated_ids[1:])
print(prefix)

[EOS]


In [None]:
def predict(prefix, original, model):
    """Greedily translate the German sentence `original` into English.

    Args:
        prefix: beginning of the translation to continue ("" to start fresh).
            It is tokenized with `tokenize_pipeline` before use.
        original: German source sentence.
        model: callable taking `(source_ids, prefix_ids)` tensors and
            returning scores of shape (batch, vocab, seq_len).

    Returns:
        The translation as space-separated words, without [BOS]/[EOS].
        Generation stops at [EOS] or once 32 positions are filled.
    """
    max_len, bos_id, eos_id = 32, 0, 1

    def encode(sentence, words_to_ids):
        # Unknown words become [UNK] when available and are skipped otherwise.
        unk_id = words_to_ids.get("[UNK]")
        ids = (words_to_ids.get(token, unk_id) for token in tokenize_pipeline(sentence))
        return [token_id for token_id in ids if token_id is not None]

    def to_batch(ids):
        # Pad with EOS up to max_len and add the batch dimension.
        return torch.tensor(ids + [eos_id] * (max_len - len(ids))).unsqueeze(0)

    # The source is encoded once; two slots are reserved for BOS and EOS.
    source = to_batch([bos_id] + encode(original, de_words_to_ids)[: max_len - 2])
    generated = ([bos_id] + encode(prefix, en_words_to_ids))[: max_len - 1]

    with torch.no_grad():
        while len(generated) < max_len:
            scores = model((source, to_batch(generated)))
            # The column at the last generated position predicts the next token.
            next_id = scores[0, :, len(generated) - 1].argmax().item()
            if next_id == eos_id:
                break
            generated.append(next_id)

    return " ".join(en_ids_to_words[token_id] for token_id in generated[1:])


predict("", "Guten Morgen!", model)

'[EOS]'

# Result

В качестве результата вы должны предоставить bleu вашей модели на тестовой выборке wmt14 и перевод 10 предложений с немецкого на английский.

In [None]:
BOS_ID, EOS_ID = 0, 1

# Batched greedy decoding of the test set. test_dataloader iterates in dataset
# order, so answers[i] corresponds to test_dataset[i].
answers = []
with torch.no_grad():
    for source_batch, _ in test_dataloader:
        batch_size, seq_len = source_batch.shape
        # Start every prefix as [BOS] followed by EOS padding.
        prefix_batch = torch.full((batch_size, seq_len), EOS_ID, dtype=torch.long)
        prefix_batch[:, 0] = BOS_ID
        for position in range(1, seq_len):
            # Feed the prefix generated so far; the score column at
            # position - 1 predicts the token at `position`.
            scores = model((source_batch, prefix_batch))[:, :, position - 1]
            if position == 1:
                # Forbid an immediate [EOS] so no translation is empty
                # (BLEU cannot be computed for an all-empty prediction set).
                scores = scores.clone()
                scores[:, EOS_ID] = float("-inf")
            prefix_batch[:, position] = scores.argmax(dim=1).cpu()
        for row in prefix_batch.tolist():
            generated = row[1:]
            if EOS_ID in generated:
                # Keep only what precedes the first [EOS].
                generated = generated[: generated.index(EOS_ID)]
            answers.append(" ".join(en_ids_to_words[token_id] for token_id in generated))

In [None]:
# Reference translations: decode the English ids of every test example back
# to words, leaving out the [BOS] (0) and [EOS] (1) ids.
true_answers = [
    " ".join(
        en_ids_to_words[word] for word in sentence[1].tolist() if word not in (0, 1)
    )
    for sentence in test_dataset
]
true_answers[0]

'Gutach Increased safety for pedestrian'

In [None]:
# Score the generated translations against the references with BLEU.
test_bleu = bleu.compute(predictions=answers, references=true_answers)
test_bleu

{'bleu': 0.0,
 'precisions': [0.0, 0.0, 0.0, 0.0],
 'brevity_penalty': 0.005011413359241047,
 'length_ratio': 0.158830062939652,
 'translation_length': 9009,
 'reference_length': 56721}

In [None]:
# Ten German sentences whose translations go into the submission file.
de_sentences = [
    "Gutach: Noch mehr Sicherheit für Fußgänger",
    "Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?",
    "Dies bestätigt auch Peter Arnold vom Landratsamt Offenburg.",
    'Daher sei der Bau einer weiteren Ampel mehr als notwendig: "Sicherheit geht hier einfach vor", so Arnold.',
    "Pro Fahrtrichtung gibt es drei Lichtanlagen.",
    "Drückt der Fußgänger den Ampelknopf, testet der obere Radarsensor die Verkehrslage.",
    "Ein weiteres Radarsensor prüft, ob die Grünphase für den Fußgänger beendet werden kann.",
    "Josef Winkler schreibt sich seit mehr als 30 Jahren die Nöte seiner Kindheit und Jugend von der Seele.",
    "Dabei scheint Regisseur Fresacher dem Text wenig zu vertrauen.",
    "Sie werden hart angefasst, mit dem Kopf unter Wasser getaucht, mit ihren Abendroben an die Wand getackert.",
]
# Translate each sentence with the trained model, starting from an empty prefix.
en_sentences = [predict("", de_sentence, model) for de_sentence in de_sentences]

In [None]:
import yaml


# task1 carries the BLEU result, task2 the ten sample translations.
submission = {
    "tasks": [{"task1": {"answer": test_bleu}}, {"task2": {"answer": en_sentences}}]
}

# The context manager guarantees the file is flushed and closed.
with open("submission.yaml", "w", encoding="utf-8") as submission_file:
    yaml.safe_dump(submission, submission_file)