In [0]:
!pip3 -qq install torch==0.4.1
!pip -qq install torchtext==0.3.1
!pip -qq install spacy==2.0.16
!pip -qq install torchvision==0.2.1
!python -m spacy download en
!pip install sacremoses==0.0.5
!pip install subword_nmt==0.3.5
!wget -qq http://www.manythings.org/anki/rus-eng.zip 
!unzip rus-eng.zip

In [0]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Pick the compute device once for the whole notebook. The tensor constructors
# are imported from the matching namespace, so `FloatTensor(...)` and
# `LongTensor(...)` calls in later cells create tensors for that same device.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor, LongTensor
    DEVICE = torch.device('cuda')
else:
    from torch import FloatTensor, LongTensor
    DEVICE = torch.device('cpu')

np.random.seed(42)

# Attention

В прошлый раз мы реализовали простую Seq2seq модель:

![](https://raw.githubusercontent.com/tensorflow/nmt/master/nmt/g3doc/img/seq2seq.jpg =x400)  
*From [tensorflow/nmt](https://github.com/tensorflow/nmt)*

Основной её недостаток - вся информация об исходном тексте кодируется в единственный вектор фиксированного размера. Но очевидно же, что идея эта так себе.

Давайте запоминать все скрытые состояния энкодера, а не только последнее.

Дальше, для вычисления нового слова при генерации найдем сначала представление уже сгенерированного контекста (по которому обычно и генерируется следующее слово).  
По этому представлению посчитаем оценки полезности состояний энкодера: `attention weights` на картинке ниже. Чем выше вес - тем более полезно состояние. (Можно, кстати, представлять, что в предыдущем варианте мы просто давали всем состояниям кроме последнего вес 0, а последнему - 1).

С этими весами состояния энкодера суммируются, и мы получаем взвешенный вектор-представление контекста. Опять вектор?! Но теперь этот вектор получен для конкретного генерируемого слова - это же гораздо лучше, чем пытаться сделать один вектор сразу для всех генерируемых слов.

![attention](https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg =x400)  
From [Neural Machine Translation (seq2seq) Tutorial](https://www.tensorflow.org/tutorials/seq2seq).

Более наглядно это может быть в [динамике](https://raw.githubusercontent.com/yandexdataschool/nlp_course/master/resources/attention_mechanism.gif) (из cs224n + shad nlp course).

В результате получаются такие красивые картинки с визуализацией аттеншена:   
![att-vis](https://www.tensorflow.org/images/seq2seq/attention_vis.jpg =x500)

Яркость ячейки показывает насколько много внимания уделяла модель данному слову на исходном языке при генерации соответствующего ему слова.

Очень красивая статья с демонстрацией attention'а: [Attention and Augmented Recurrent Neural Networks](https://distill.pub/2016/augmented-rnns/).

## Подготовка

Возьмем те же данные, что и в прошлый раз:

In [0]:
!shuf -n 10 rus.txt

Токенизируем их:

In [0]:
from torchtext.data import Field, Example, Dataset, BucketIterator

# Sentence-boundary markers shared by the translation fields.
BOS_TOKEN = '<s>'
EOS_TOKEN = '</s>'

# Source sentences are tokenized with spaCy and only get a closing EOS token;
# target sentences are tokenized with Moses and are wrapped in BOS ... EOS.
# Both fields lowercase their tokens.
source_field = Field(tokenize='spacy', init_token=None, eos_token=EOS_TOKEN, lower=True)
target_field = Field(tokenize='moses', init_token=BOS_TOKEN, eos_token=EOS_TOKEN, lower=True)
# Column order used when `Example`s are built: (source, target).
fields = [('source', source_field), ('target', target_field)]

In [0]:
from tqdm import tqdm

# Sentence pairs where either side has more tokens than this are dropped.
MAX_TOKENS_COUNT = 16
# Fraction of the remaining pairs that is randomly kept.
SUBSET_SIZE = .3

examples = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding (the file contains Cyrillic text).
with open('rus.txt', encoding='utf-8') as f:
    # `total` is a hard-coded line count that only drives the progress bar.
    for line in tqdm(f, total=328190):
        # The first two tab-separated columns are the sentence pair. Any extra
        # columns (e.g. an attribution column, if the export has one) are
        # ignored, and blank or malformed lines are skipped instead of raising
        # on tuple unpacking.
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            continue

        source_text = source_field.preprocess(columns[0])
        target_text = target_field.preprocess(columns[1])
        if len(source_text) <= MAX_TOKENS_COUNT and len(target_text) <= MAX_TOKENS_COUNT:
            # Random sub-sampling; reproducible through the NumPy seed set earlier.
            if np.random.rand() < SUBSET_SIZE:
                examples.append(Example.fromlist([source_text, target_text], fields))

Построим датасеты:

In [0]:
# Wrap the filtered sentence pairs into a torchtext dataset.
dataset = Dataset(examples, fields)

# 85% of the pairs go to training, the rest is held out.
train_dataset, test_dataset = dataset.split(split_ratio=0.85)

print('Train size =', len(train_dataset))
print('Test size =', len(test_dataset))

# Vocabularies are built from the training split only; tokens seen fewer than
# two times are left out of the vocabulary.
source_field.build_vocab(train_dataset, min_freq=2)
print('Source vocab size =', len(source_field.vocab))

target_field.build_vocab(train_dataset, min_freq=2)
print('Target vocab size =', len(target_field.vocab))

# Batch size is 32 for training and 512 for the held-out split; batches are
# created directly on DEVICE.
train_iter, test_iter = BucketIterator.splits(
    datasets=(train_dataset, test_dataset), batch_sizes=(32, 512), shuffle=True, device=DEVICE, sort=False
)

## Seq2seq модель

Старая модель выглядела так:

In [0]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a GRU."""

    def __init__(self, vocab_size, emb_dim=128, rnn_hidden_dim=256, num_layers=1, bidirectional=False):
        super().__init__()

        self._emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self._rnn = nn.GRU(emb_dim, rnn_hidden_dim, num_layers=num_layers, bidirectional=bidirectional)

    def forward(self, inputs, hidden=None):
        """
        inputs: LongTensor with shape (encoder_seq_len, batch_size)
        hidden: optional initial GRU state; FloatTensor with shape
            (num_layers * num_directions, batch_size, rnn_hidden_dim),
            i.e. (1, batch_size, rnn_hidden_dim) for the default configuration

        Returns the pair (encoder_output, encoder_hidden) produced by the GRU:
        the per-position outputs and the final state.
        """
        embedded = self._emb(inputs)
        return self._rnn(embedded, hidden)

In [0]:
class Decoder(nn.Module):
    """Baseline GRU decoder without attention (the "old" model).

    `attn_dim`, `encoder_output` and `encoder_mask` are accepted but not used
    here. Other cells of this notebook (`do_epoch`, `evaluate_model`,
    `greedy_decode`) unpack three values from the decoder, so the
    attention-enabled replacement must additionally return attention weights.
    """

    def __init__(self, vocab_size, emb_dim=128, rnn_hidden_dim=256, attn_dim=128, num_layers=1):
        super().__init__()

        self._emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self._rnn = nn.GRU(emb_dim, rnn_hidden_dim, num_layers=num_layers)
        self._out = nn.Linear(in_features=rnn_hidden_dim, out_features=vocab_size)

    def forward(self, inputs, encoder_output, encoder_mask, hidden=None):
        """
        inputs: LongTensor with shape (decoder_seq_len, batch_size)
        encoder_output: FloatTensor with shape (encoder_seq_len, batch_size, rnn_hidden_dim) (unused here)
        encoder_mask: ByteTensor with shape (encoder_seq_len, batch_size) (ones in positions of <pad> tokens, zeros everywhere else; unused here)
        hidden: FloatTensor with shape (1, batch_size, rnn_hidden_dim)

        Returns (logits, hidden): logits has shape
        (decoder_seq_len, batch_size, vocab_size); hidden is the final GRU state.
        """
        embedded = self._emb(inputs)

        step_outputs = []
        # Unroll the GRU one target position at a time, threading the state through.
        for step_emb in embedded:
            step_output, hidden = self._rnn(step_emb.unsqueeze(0), hidden)
            step_outputs.append(step_output)

        logits = self._out(torch.cat(step_outputs))
        return logits, hidden

## Реализация attention'а

В общем случае, attention работает так: пусть у нас есть набор скрытых состояний $\mathbf{s}_1, \ldots, \mathbf{s}_m$ - представлений слов из исходного языка, полученных с помощью энкодера. И есть некоторое текущее скрытое состояние $\mathbf{h}_i$ - скажем, представление, используемое для предсказания слова на нужном нам языке.

Тогда с помощью аттеншена мы можем получить взвешенное представление контекста $\mathbf{s}_1, \ldots, \mathbf{s}_m$ - вектор $\mathbf{c}_i$:
$$
\begin{align}\begin{split}
\mathbf{c}_i &= \sum\limits_j a_{ij}\mathbf{s}_j\\
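a_{ij} &= \text{softmax}_j\,\big(f_{att}(\mathbf{h}_i, \mathbf{s}_j)\big) = \frac{\exp\big(f_{att}(\mathbf{h}_i, \mathbf{s}_j)\big)}{\sum_{k=1}^{m} \exp\big(f_{att}(\mathbf{h}_i, \mathbf{s}_k)\big)}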
\end{split}\end{align}
$$

$f_{att}$ - функция, которая говорит, насколько хорошо $\mathbf{h}_i$ и $\mathbf{s}_j$ подходят друг другу.

Самые популярные её варианты:
- Additive attention:
$$f_{att}(\mathbf{h}_i, \mathbf{s}_j) = \mathbf{v}_a{}^\top \text{tanh}(\mathbf{W}_a\mathbf{h}_i + \mathbf{W}_b\mathbf{s}_j)$$
- Dot attention:
$$f_{att}(\mathbf{h}_i, \mathbf{s}_j) = \mathbf{h}_i^\top \mathbf{s}_j$$
- Multiplicative attention:
$$f_{att}(\mathbf{h}_i, \mathbf{s}_j) = \mathbf{h}_i^\top \mathbf{W}_a \mathbf{s}_j$$

**Задание** Реализуйте Additive attention.

In [0]:
class AdditiveAttention(nn.Module):
    """Additive attention: f_att(h_i, s_j) = v_a^T tanh(W_a h_i + W_b s_j).

    `_query_layer` plays the role of W_a, `_key_layer` of W_b and
    `_energy_layer` of v_a (each is an `nn.Linear`, so a bias term is included).
    """

    def __init__(self, query_size, key_size, hidden_dim):
        super().__init__()

        self._query_layer = nn.Linear(query_size, hidden_dim)
        self._key_layer = nn.Linear(key_size, hidden_dim)
        self._energy_layer = nn.Linear(hidden_dim, 1)

    def forward(self, query, key, value, mask):
        """
        query: FloatTensor with shape (batch_size, query_size) (h_i)
        key: FloatTensor with shape (encoder_seq_len, batch_size, key_size) (sequence of s_1, ..., s_m)
        value: FloatTensor with shape (encoder_seq_len, batch_size, key_size) (sequence of s_1, ..., s_m)
        mask: ByteTensor with shape (encoder_seq_len, batch_size) (ones in positions of <pad> tokens, zeros everywhere else);
            may be None, in which case nothing is masked

        Returns (context, weights):
            context: FloatTensor with shape (batch_size, key_size), the attention-weighted sum of `value`
            weights: FloatTensor with shape (encoder_seq_len, batch_size, 1), softmax-normalized over the sequence axis
        """
        # f_att for every encoder position: the query projection is broadcast
        # over the sequence axis.
        query_proj = self._query_layer(query).unsqueeze(0)             # (1, batch, hidden)
        key_proj = self._key_layer(key)                                # (seq, batch, hidden)
        f_att = self._energy_layer(torch.tanh(query_proj + key_proj))  # (seq, batch, 1)

        if mask is not None:
            # `mask != 0` gives whatever mask dtype the installed torch version
            # expects. The fill is out-of-place so autograd tracks it.
            # A column whose positions are all masked would produce NaN weights.
            pad_positions = (mask != 0).unsqueeze(2)
            f_att = f_att.masked_fill(pad_positions, -float('inf'))

        # After the softmax, masked positions get a weight of exactly 0.
        weights = F.softmax(f_att, dim=0)
        context = (weights * value).sum(dim=0)
        return context, weights

Нужно обновить `Decoder`, чтобы он работал с attention'ом:  
![](https://image.ibb.co/fB12nq/2018-11-12-23-34-06.png =x500)  
*From [Attention and Augmented Recurrent Neural Networks](https://distill.pub/2016/augmented-rnns/#attentional-interfaces)*

На каждом шаге rnn'ки будем использовать текущее скрытое состояние декодера, чтобы определить, какие из состояний энкодера самые интересные.

Выход attention'а (текущий контекст) будем конкатенировать к эмбеддингу слова.

**Задание** Обновите `Decoder`.

In [0]:
# Grab a single training batch to smoke-test the model with.
batch = next(iter(train_iter))

Модель перевода будет просто сперва вызывать Encoder, а потом передавать его скрытое состояние декодеру в качестве начального.

In [0]:
class TranslationModel(nn.Module):
    """Encoder-decoder translation model.

    The encoder's final state initializes the decoder, which also receives all
    encoder outputs plus a padding mask.

    pad_index: index of the padding token in the source vocabulary. Defaults
        to 1, the value that used to be hard-coded here.

    NOTE(review): `bidirectional_encoder=True` changes the shapes of the encoder
    outputs and state — confirm the decoder in use accepts them.
    """

    def __init__(self, source_vocab_size, target_vocab_size, emb_dim=64, rnn_hidden_dim=128,
                 attn_dim=128, num_layers=1, bidirectional_encoder=False, pad_index=1):

        super().__init__()

        self._pad_index = pad_index

        self.encoder = Encoder(source_vocab_size, emb_dim, rnn_hidden_dim, num_layers, bidirectional_encoder)
        self.decoder = Decoder(target_vocab_size, emb_dim, rnn_hidden_dim, attn_dim, num_layers)

    def forward(self, source_inputs, target_inputs):
        """
        source_inputs: LongTensor with shape (encoder_seq_len, batch_size)
        target_inputs: LongTensor with shape (decoder_seq_len, batch_size)

        Returns whatever `self.decoder` returns.
        """
        # Non-zero exactly at the padded source positions.
        encoder_mask = source_inputs == self._pad_index
        encoder_output, encoder_hidden = self.encoder(source_inputs)

        return self.decoder(target_inputs, encoder_output, encoder_mask, encoder_hidden)

In [0]:
# Build the model with default hyper-parameters and move it to the selected device.
model = TranslationModel(source_vocab_size=len(source_field.vocab), target_vocab_size=len(target_field.vocab)).to(DEVICE)

# Smoke test: one forward pass over the sample batch.
model(batch.source, batch.target)

## Тренировка модели

In [0]:
from nltk.translate.bleu_score import corpus_bleu

def _trim_at_eos(sequences, eos_index):
    """Cut every row of a 2-D array of token ids right before its first EOS.

    Rows that contain no EOS are kept whole.
    """
    is_eos = sequences == eos_index
    # `argmax` alone cannot tell "EOS at position 0" from "no EOS at all",
    # so the presence of an EOS is checked explicitly.
    cut_points = np.where(is_eos.any(-1), is_eos.argmax(-1), sequences.shape[1])
    return [row[:cut] for row, cut in zip(sequences, cut_points)]


def evaluate_model(model, iterator, max_len=30):
    """Greedily translate every batch of `iterator` and return corpus BLEU (0-100).

    model: object with `encoder(source)` and
        `decoder(inputs, encoder_output, mask, hidden)`; the decoder must
        return three values (logits, hidden, attention).
    iterator: batch iterator whose `dataset.fields` has 'source' and 'target'
        fields with built vocabularies.
    max_len: number of greedy decoding steps per batch (default 30, the value
        that used to be hard-coded).

    Hypotheses and references are compared as token-id sequences cut at the
    first EOS; both keep their leading BOS token. The model is left in eval mode.
    """
    model.eval()
    refs, hyps = [], []

    dataset_fields = iterator.dataset.fields
    target_vocab = dataset_fields['target'].vocab
    bos_index = target_vocab.stoi[BOS_TOKEN]
    eos_index = target_vocab.stoi[EOS_TOKEN]
    pad_index = dataset_fields['source'].vocab.stoi['<pad>']

    with torch.no_grad():
        for batch in iterator:
            encoder_output, encoder_hidden = model.encoder(batch.source)
            mask = batch.source == pad_index

            hidden = encoder_hidden
            # A row of BOS tokens on the same device and dtype as the targets.
            step = batch.target.new_full((1, batch.target.shape[1]), bos_index)
            generated = [step]

            for _ in range(max_len):
                logits, hidden, _ = model.decoder(step, encoder_output, mask, hidden)
                step = logits.argmax(-1)
                generated.append(step)

            # Transpose to (batch_size, seq_len) so that each row is one sentence.
            targets = batch.target.data.cpu().numpy().T
            refs.extend(_trim_at_eos(targets, eos_index))

            hypotheses = torch.cat(generated).data.cpu().numpy().T
            hyps.extend(_trim_at_eos(hypotheses, eos_index))

    return corpus_bleu([[ref] for ref in refs], hyps) * 100

In [0]:
# BLEU of the untrained model on the held-out split. `evaluate_model` unpacks
# three values from `model.decoder`, so this needs a decoder that also returns
# attention weights.
evaluate_model(model, test_iter)

In [0]:
import math
from tqdm import tqdm
tqdm.get_lock().locks = []


def do_epoch(model, criterion, data_iter, optimizer=None, name=None):
    """Run one pass over `data_iter` and return the mean per-batch loss.

    With an optimizer the model is put in train mode and updated after every
    batch (gradient norm clipped to 1); without one the pass runs in eval mode
    with gradients disabled. Progress, loss and perplexity are shown in a tqdm bar.
    """
    status_format = '{:>5s} Loss = {:.5f}, PPX = {:.2f}'

    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    batches_count = len(data_iter)
    total_loss = 0

    with torch.autograd.set_grad_enabled(is_train), tqdm(total=batches_count) as progress_bar:
        for batch in data_iter:
            logits, _, _ = model(batch.source, batch.target)

            # The gold output at position t is the input token at t + 1, so the
            # targets are shifted by one and closed with a row of ones
            # (presumably the <pad> index that the criterion ignores).
            closing_row = batch.target.new_ones((1, batch.target.shape[1]))
            target = torch.cat((batch.target[1:], closing_row))
            loss = criterion(logits.view(-1, logits.shape[-1]), target.view(-1))

            batch_loss = loss.item()
            total_loss += batch_loss

            if optimizer:
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.)
                optimizer.step()

            progress_bar.update()
            progress_bar.set_description(status_format.format(name, batch_loss, math.exp(batch_loss)))

        # Replace the last per-batch message with the epoch average.
        mean_loss = total_loss / batches_count
        progress_bar.set_description(status_format.format(name, mean_loss, math.exp(mean_loss)))
        progress_bar.refresh()

    return total_loss / batches_count


def fit(model, criterion, optimizer, train_iter, epochs_count=1, val_iter=None):
    """Train for `epochs_count` epochs.

    After every training epoch, if `val_iter` is given, a validation pass is
    run and the validation BLEU is printed. Returns nothing.
    """
    for epoch_number in range(1, epochs_count + 1):
        name_prefix = '[{} / {}] '.format(epoch_number, epochs_count)
        do_epoch(model, criterion, train_iter, optimizer, name_prefix + 'Train:')

        if val_iter is None:
            continue

        # No optimizer is passed, so this pass only evaluates.
        do_epoch(model, criterion, val_iter, None, name_prefix + '  Val:')
        print('\nVal BLEU = {:.2f}'.format(evaluate_model(model, val_iter)))

In [0]:
# Start from a freshly initialized model.
model = TranslationModel(source_vocab_size=len(source_field.vocab), target_vocab_size=len(target_field.vocab)).to(DEVICE)

# Padded target positions do not contribute to the loss.
pad_idx = target_field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# 30 epochs, validating on the held-out split after each one.
fit(model, criterion, optimizer, train_iter, epochs_count=30, val_iter=test_iter)

## Визуализация результатов

In [0]:
def greedy_decode(model, source_text, source_field, target_field, max_len=50):
    """Translate one sentence greedily and collect the attention of every step.

    model: object with `encoder(inputs)` and
        `decoder(inputs, encoder_output, encoder_mask, hidden)`; the decoder
        must return (logits, hidden, attention).
    source_text: raw source sentence (str).
    source_field / target_field: fields with built vocabularies.
    max_len: maximum number of decoding steps (default 50, the value that used
        to be hard-coded).

    Returns (source, result, attentions):
        source: list of source tokens as produced by `source_field.preprocess`
        result: list of generated target tokens, without the closing EOS
        attentions: numpy array with the per-step attentions concatenated along
            the last axis; the step that produced EOS is included
    """
    bos_index = target_field.vocab.stoi[BOS_TOKEN]
    eos_index = target_field.vocab.stoi[EOS_TOKEN]

    model.eval()
    with torch.no_grad():
        token_ids, attentions = [], []
        source = source_field.preprocess(source_text)
        inputs = source_field.process([source]).to(DEVICE)

        encoder_output, encoder_hidden = model.encoder(inputs)
        # A single sentence has no padding, so nothing is masked.
        encoder_mask = torch.zeros_like(inputs).byte()

        hidden = encoder_hidden
        # (1, 1) tensor holding BOS, on the same device and dtype as the inputs.
        step = inputs.new_full((1, 1), bos_index)

        for _ in range(max_len):
            logits, hidden, attention = model.decoder(step, encoder_output, encoder_mask, hidden)
            step = logits.argmax(-1)
            attentions.append(attention.squeeze(1))

            token_id = step.item()
            if token_id == eos_index:
                break

            token_ids.append(token_id)

        # `token_ids` holds plain Python ints, so they index the vocabulary directly.
        result = [target_field.vocab.itos[token_id] for token_id in token_ids]
        return source, result, torch.cat(attentions, -1).data.cpu().numpy()

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_heatmap(src, trg, scores):
    """Draw an attention heatmap.

    src: labels for the rows (source tokens)
    trg: labels for the columns (generated tokens)
    scores: 2-D array with shape (len(src), len(trg))

    Labels beyond the number of rows/columns of `scores` are dropped, so a
    translation that never produced EOS can still be plotted.
    """
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(scores, cmap='viridis')

    n_rows, n_cols = scores.shape
    y_labels = list(src)[:n_rows]
    x_labels = list(trg)[:n_cols]

    # Fix the tick positions (cell centres) before attaching the labels:
    # labels are assigned to whatever ticks exist at that moment.
    ax.set_xticks(np.arange(len(x_labels)) + 0.5, minor=False)
    ax.set_yticks(np.arange(len(y_labels)) + 0.5, minor=False)
    ax.set_xticklabels(x_labels, minor=False, rotation=45)
    ax.set_yticklabels(y_labels, minor=False)

    # Target tokens along the top, first source token in the top row.
    ax.xaxis.tick_top()
    ax.invert_yaxis()

    plt.colorbar(heatmap)
    plt.show()

In [0]:
# Translate one sample sentence and keep the attention of every decoding step.
source, result, attentions = greedy_decode(model, "I didn't pay.", source_field, target_field)

In [0]:
# '</s>' is added to both label lists: the source field appends EOS to the
# encoder input, and `greedy_decode` records the attention of the step that
# emitted EOS.
plot_heatmap(source + ['</s>'], result + ['</s>'], attentions)

## Улучшение модели

**Задание** Попробуйте другие варианты attention'а (из приведенных выше).

**Задание** Попробуйте приемы из тех, что были в предыдущем ноутбуке: bpe, dropout, bidirectional or multi-layer encoder.

# Image Captioning with Attention

Attention может работать не только для текстов. Мы вполне можем аттентиться и на картинки:

![](https://cdn-images-1.medium.com/max/2000/0*YCeQbqU6CVxzpave.)  
*From [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044)*

В качестве энкодера будет выступать сверточная сеть. Модель теперь должна научиться генерировать маску внимания на каждом шаге:

![](http://kelvinxu.github.io/projects/diags/model_diag.png =x300)  
*From [http://kelvinxu.github.io/projects/capgen.html](http://kelvinxu.github.io/projects/capgen.html)*

Скачаем данные:

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Precomputed CNN feature maps; a later cell memory-maps this file.
downloaded = drive.CreateFile({'id': '1qM6fJuaOqhTES17kU_rz9Ydxz540bSJ6'})
downloaded.GetContentFile('image_codes_for_attn.npy')

# One image path per line.
downloaded = drive.CreateFile({'id': '1O7_3lyTyBMXsBBIt1PwUXwLdkyRQzZML'})
downloaded.GetContentFile('sources.txt')

# One line per image holding its tab-separated captions.
downloaded = drive.CreateFile({'id': '1t-Dy8TzoRuTMoM7N9NJZKgWXfaw3b6KF'})
downloaded.GetContentFile('texts.txt')

# The images themselves, used only for visualization.
# NOTE(review): `visualize` reads from 'Flicker8k_Dataset/' — presumably the
# directory this archive unpacks into; confirm.
!wget http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_Dataset.zip
!unzip Flickr8k_Dataset.zip

Скачаем модель сверточной сети (чтобы было)

In [0]:
from torchvision.models.inception import Inception3
from torch.utils.model_zoo import load_url


class BeheadedInception3(Inception3):
    """ Inception3 whose forward pass also returns the feature tensors that precede the classification head """

    def forward(self, x):
        """Return (x_for_attn, x_for_capt, logits).

        x_for_attn: output of the last mixed block (8 x 8 x 2048 per image)
        x_for_capt: the same features average-pooled and flattened (2048 per image)
        logits: output of the `fc` head (1000 classes)
        """
        # Work on a copy: the per-channel rescaling below is done in place.
        x = x.clone()
        channel_stats = ((0.485, 0.229), (0.456, 0.224), (0.406, 0.225))
        for channel, (mean, std) in enumerate(channel_stats):
            x[:, channel] = x[:, channel] * (std / 0.5) + (mean - 0.5) / 0.5

        for layer_name in ('Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3'):
            x = getattr(self, layer_name)(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2)

        for layer_name in ('Conv2d_3b_1x1', 'Conv2d_4a_3x3'):
            x = getattr(self, layer_name)(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2)

        mixed_blocks = (
            'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
            'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e',
            'Mixed_7a', 'Mixed_7b', 'Mixed_7c',
        )
        for layer_name in mixed_blocks:
            x = getattr(self, layer_name)(x)

        # 8 x 8 x 2048
        x_for_attn = x

        # 1 x 1 x 2048, then flattened to 2048
        pooled = F.avg_pool2d(x_for_attn, kernel_size=8)
        x_for_capt = pooled.view(pooled.size(0), -1)

        # 1000 (num_classes)
        logits = self.fc(x_for_capt)
        return x_for_attn, x_for_capt, logits
    

# Build the network and load the pretrained Inception v3 weights into it.
inception_model = BeheadedInception3()

inception_url = 'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth'
inception_model.load_state_dict(load_url(inception_url))

# Inference mode; the network is not trained in this notebook.
inception_model.eval()

Загрузим данные:

In [0]:
# Captions are wrapped in BOS ... EOS. No tokenizer is passed, so the Field's
# default tokenization is used.
target_field = Field(init_token=BOS_TOKEN, eos_token=EOS_TOKEN)
# The image index is stored as a plain value: not a sequence, no vocabulary.
image_indices_field = Field(sequential=False, use_vocab=False)

# Column order used when `Example`s are built: (caption, image index).
fields = [('target', target_field), ('image_index', image_indices_field)]

Чтобы не уткнуться в лимит по памяти - используем memmap формат:

In [0]:
# One image path per line; the list position is the image index.
with open('sources.txt') as f_sources:
    image_paths = [line.strip() for line in f_sources]
    
# Memory-map the feature file instead of loading it: 8091 images with a
# 2048 x 8 x 8 float32 feature map each.
# NOTE(review): np.memmap maps raw bytes from the start of the file, so this
# assumes a header-less float32 dump despite the .npy extension — confirm.
# NOTE(review): no `mode` is passed, so NumPy's default mode applies — confirm
# that a writable mapping is intended.
image_tensors = np.memmap('image_codes_for_attn.npy', shape=(8091, 2048, 8, 8), dtype=np.float32)

examples = []
with open('texts.txt') as f_texts:
    # Line number == image index; every tab-separated caption on the line
    # becomes a separate example that points to the same image.
    for image_ind, texts in enumerate(f_texts):
        for text in texts.split('\t'):
            examples.append(Example.fromlist([target_field.preprocess(text), image_ind], fields))

In [0]:
# Wrap the (caption, image index) pairs into a torchtext dataset.
dataset = Dataset(examples, fields)

# The split is made over individual captions, so captions of one image can
# land in both the train and the test part.
train_dataset, test_dataset = dataset.split(split_ratio=0.85)

print('Train size =', len(train_dataset))
print('Test size =', len(test_dataset))

# The vocabulary is built from the training captions only; tokens seen fewer
# than two times are left out.
target_field.build_vocab(train_dataset, min_freq=2)
print('Target vocab size =', len(target_field.vocab))

# Batch size is 16 for training and 64 for the held-out split.
train_iter, test_iter = BucketIterator.splits(
    datasets=(train_dataset, test_dataset), batch_sizes=(16, 64), shuffle=True, device=DEVICE, sort=False
)

**Задание** Реализуйте почти такие же модели как для перевода, чтобы научиться подписывать картинки:

In [0]:
class AdditiveAttention(nn.Module):
    """Additive attention over a set of regions, with pre-projected keys.

    Scores are v_a^T tanh(W_a query + key_proj), where `_query_layer` is W_a and
    `_energy_layer` is v_a. The caller computes `key_proj` (for example with
    this module's `_key_layer`), because the keys do not change between
    decoding steps.
    """

    def __init__(self, query_size, key_size, hidden_dim):
        super().__init__()

        self._query_layer = nn.Linear(query_size, hidden_dim)
        self._key_layer = nn.Linear(key_size, hidden_dim)
        self._energy_layer = nn.Linear(hidden_dim, 1)

    def forward(self, query, key_proj, value):
        """
        query: FloatTensor with shape (batch_size, query_size)
        key_proj: FloatTensor with shape (batch_size, num_regions, hidden_dim), the keys already projected to `hidden_dim`
        value: FloatTensor with shape (batch_size, num_regions, key_size)

        Returns (context, weights):
            context: FloatTensor with shape (batch_size, key_size), the attention-weighted sum of `value`
            weights: FloatTensor with shape (batch_size, num_regions), softmax-normalized over the regions
        """
        # The query projection is broadcast over the region axis.
        query_proj = self._query_layer(query).unsqueeze(1)                          # (batch, 1, hidden)
        energies = self._energy_layer(torch.tanh(query_proj + key_proj)).squeeze(-1)  # (batch, regions)

        weights = F.softmax(energies, dim=-1)
        # (batch, 1, regions) x (batch, regions, key_size) -> (batch, 1, key_size)
        context = torch.bmm(weights.unsqueeze(1), value).squeeze(1)
        return context, weights

In [0]:
class Decoder(nn.Module):
    """LSTM caption decoder that attends over a grid of CNN features.

    At every step the top-layer hidden state queries the attention module, and
    the resulting context vector is concatenated to the word embedding before
    it enters the LSTM.

    The attention module is called as `self._attention(query, key_proj, value)`
    and is expected to return `(context, weights)` with shapes
    (batch_size, cnn_feature_size) and (batch_size, num_regions).
    """

    def __init__(self, vocab_size, cnn_feature_size, emb_dim=128, rnn_hidden_dim=256, attn_dim=128, num_layers=1):
        super().__init__()

        self._emb = nn.Embedding(vocab_size, emb_dim)
        self._attention = AdditiveAttention(query_size=rnn_hidden_dim, key_size=cnn_feature_size, hidden_dim=attn_dim)

        self._cnn_to_h0 = nn.Linear(cnn_feature_size, rnn_hidden_dim)
        self._cnn_to_c0 = nn.Linear(cnn_feature_size, rnn_hidden_dim)
        self._rnn = nn.LSTM(input_size=emb_dim + cnn_feature_size, hidden_size=rnn_hidden_dim, num_layers=num_layers)
        self._out = nn.Linear(rnn_hidden_dim, vocab_size)

    def forward(self, encoder_output, inputs, hidden=None):
        """
        encoder_output: FloatTensor with shape (batch_size, cnn_feature_size, height, width)
        inputs: LongTensor with shape (decoder_seq_len, batch_size)
        hidden: optional LSTM state (h, c); derived from the image features when omitted

        Returns (logits, hidden, attentions):
            logits: FloatTensor with shape (decoder_seq_len, batch_size, vocab_size)
            hidden: final LSTM state (h, c)
            attentions: list with one weights tensor per decoding step
        """
        embs = self._emb(inputs)

        seq_len, batch_size = inputs.shape[:2]

        # (batch, features, height, width) -> (batch, height * width, features):
        # every spatial position becomes one region to attend to.
        encoder_output = encoder_output.reshape(batch_size, encoder_output.shape[1], -1).permute(0, 2, 1)

        if hidden is None:
            hidden = self.init_hidden(encoder_output)

        # The regions are the same at every step, so project them only once.
        key_proj = self._attention._key_layer(encoder_output)

        outputs, attentions = [], []
        for step_emb in embs:
            # Query with the hidden state of the top LSTM layer.
            query = hidden[0][-1]
            context, weights = self._attention(query, key_proj, encoder_output)

            rnn_input = torch.cat((step_emb, context), dim=-1).unsqueeze(0)
            output, hidden = self._rnn(rnn_input, hidden)

            outputs.append(output)
            attentions.append(weights)

        output = torch.cat(outputs)
        return self._out(output), hidden, attentions

    def init_hidden(self, encoder_output):
        """Build the initial LSTM state (h0, c0) from the mean of the region features.

        encoder_output: FloatTensor with shape (batch_size, num_regions, cnn_feature_size)

        The same initial state is used for every LSTM layer.
        """
        mean_features = encoder_output.mean(dim=1)
        num_layers = self._rnn.num_layers
        h0 = self._cnn_to_h0(mean_features).unsqueeze(0).repeat(num_layers, 1, 1)
        c0 = self._cnn_to_c0(mean_features).unsqueeze(0).repeat(num_layers, 1, 1)

        return h0, c0

In [0]:
def do_epoch(model, criterion, data_iter, optimizer=None, name=None):
    """Run one pass over `data_iter` for the captioning model and return the mean per-batch loss.

    With an optimizer the model is put in train mode and updated after every
    batch (gradient norm clipped to 1); without one the pass runs in eval mode
    with gradients disabled. Progress, loss and perplexity are shown in a tqdm bar.
    """
    status_format = '{:>5s} Loss = {:.5f}, PPX = {:.2f}'

    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    batches_count = len(data_iter)
    total_loss = 0

    with torch.autograd.set_grad_enabled(is_train), tqdm(total=batches_count) as progress_bar:
        for batch in data_iter:
            # Look up the precomputed CNN features of the images in this batch.
            encoder_output = FloatTensor(image_tensors[batch.image_index])
            logits, _, _ = model(encoder_output, batch.target)

            # The gold output at position t is the input token at t + 1, so the
            # targets are shifted by one and closed with a row of ones
            # (presumably the <pad> index that the criterion ignores).
            closing_row = batch.target.new_ones((1, batch.target.shape[1]))
            target = torch.cat((batch.target[1:], closing_row))
            loss = criterion(logits.view(-1, logits.shape[-1]), target.view(-1))

            batch_loss = loss.item()
            total_loss += batch_loss

            if optimizer:
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.)
                optimizer.step()

            progress_bar.update()
            progress_bar.set_description(status_format.format(name, batch_loss, math.exp(batch_loss)))

        # Replace the last per-batch message with the epoch average.
        mean_loss = total_loss / batches_count
        progress_bar.set_description(status_format.format(name, mean_loss, math.exp(mean_loss)))
        progress_bar.refresh()

    return total_loss / batches_count


def fit(model, criterion, optimizer, train_iter, epochs_count=1, val_iter=None):
    """Train for `epochs_count` epochs.

    After every training epoch, if `val_iter` is given, a validation pass is
    run. Returns nothing.
    """
    for epoch_number in range(1, epochs_count + 1):
        name_prefix = '[{} / {}] '.format(epoch_number, epochs_count)
        do_epoch(model, criterion, train_iter, optimizer, name_prefix + 'Train:')

        if val_iter is not None:
            # No optimizer is passed, so this pass only evaluates.
            do_epoch(model, criterion, val_iter, None, name_prefix + '  Val:')

In [0]:
# The captioning model is just the attention decoder: the CNN features are
# precomputed, and their channel count (axis 1 of `image_tensors`) is the
# feature size.
model = Decoder(vocab_size=len(target_field.vocab), cnn_feature_size=image_tensors.shape[1]).to(DEVICE)

# Padded target positions do not contribute to the loss.
pad_idx = target_field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# 30 epochs, validating on the held-out split after each one.
fit(model, criterion, optimizer, train_iter, epochs_count=30, val_iter=test_iter)

**Задание** Напишите цикл генерации по картинке:

In [0]:
def generate(image_tensor, max_len=30):
    """Greedily generate a caption for one image with the global `model`.

    image_tensor: array of CNN features for a single image with shape
        (cnn_feature_size, height, width), without a batch axis.
    max_len: maximum number of generated words.

    Returns (words, attentions):
        words: list of generated tokens, without BOS and EOS
        attentions: list with one (height, width) numpy array per word, the
            attention over the feature grid at the step that produced the word
    """
    bos_index = target_field.vocab.stoi[BOS_TOKEN]
    eos_index = target_field.vocab.stoi[EOS_TOKEN]

    words, attentions = [], []

    model.eval()
    with torch.no_grad():
        # Copy into a regular float32 array first: the input may be a slice of
        # a memory-mapped file.
        features = torch.from_numpy(np.array(image_tensor, dtype=np.float32)).to(DEVICE)
        grid_shape = tuple(features.shape[-2:])
        encoder_output = features.unsqueeze(0)  # add the batch axis

        step = torch.full((1, 1), bos_index, dtype=torch.long, device=DEVICE)
        # None lets the model derive its initial state from the image.
        hidden = None

        for _ in range(max_len):
            logits, hidden, step_attentions = model(encoder_output, step, hidden)
            step = logits.argmax(-1)

            token_id = step.item()
            if token_id == eos_index:
                break

            words.append(target_field.vocab.itos[token_id])
            # One token is fed per call, so the last entry is this step's
            # attention; fold it back onto the feature grid.
            attentions.append(step_attentions[-1].reshape(grid_shape).cpu().numpy())

    return words, attentions

In [0]:
def visualize(image_tensors, image_paths, image_index):
    """Generate a caption for one image and show where the model looked for every word.

    image_tensors: indexable collection of per-image CNN feature maps
    image_paths: image file names relative to 'Flicker8k_Dataset/'
    image_index: position of the image in both collections

    One subplot is drawn per generated word: the photo with the attention map
    of that word laid over it.
    """
    words, attentions = generate(image_tensors[image_index])

    figure = plt.figure(figsize=(15, 20))

    image_path = image_paths[image_index]
    image = plt.imread('Flicker8k_Dataset/' + image_path)

    # Draw the photo and the coarse attention grid into the same extent, so the
    # overlay is stretched over the whole photo by matplotlib itself. This needs
    # no external resize function.
    height, width = image.shape[:2]
    extent = (-0.5, width - 0.5, height - 0.5, -0.5)

    columns = 3
    rows = -(-len(words) // columns)  # integer ceiling; add_subplot needs ints

    for ind, (word, attention) in enumerate(zip(words, attentions)):
        ax = figure.add_subplot(rows, columns, ind + 1)

        ax.text(0, 1, word, color='black', backgroundcolor='white', fontsize=12)
        ax.imshow(image, extent=extent)

        # The overlay shows 1 - attention, smoothed by bilinear interpolation.
        alpha = 1 - np.asarray(attention)

        ax.imshow(alpha, alpha=0.7, extent=extent, interpolation='bilinear')

        ax.axis('off')

Визуализируем это!

In [0]:
# `visualize` takes the feature array and the path list in addition to the
# index of the image to show.
visualize(image_tensors, image_paths, test_dataset.examples[0].image_index)

# Дополнительные материалы

## Статьи
Neural Machine Translation by Jointly Learning to Align and Translate, Bahdanau, 2014 [[pdf]](https://arxiv.org/pdf/1409.0473.pdf)  
Effective Approaches to Attention-based Neural Machine Translation, Luong, 2015 [[arxiv]](http://arxiv.org/abs/1508.04025)  
Show, Attend and Tell: Neural Image Caption Generation with Visual Attention, Xu, 2015 [[arxiv]](https://arxiv.org/abs/1502.03044)

## Блоги
[Attention and Augmented Recurrent Neural Networks](https://distill.pub/2016/augmented-rnns/)  
[Deep Learning for NLP Best Practices, Attention](http://ruder.io/deep-learning-nlp-best-practices/index.html#attention)  
[Attention? Attention!](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html)  
[Multi-Modal Methods: Image Captioning (From Translation to Attention)](https://medium.com/mlreview/multi-modal-methods-image-captioning-from-translation-to-attention-895b6444256e)  

## Видео
[Attention в Deep Learning и машинный перевод в очень широком смысле](https://www.youtube.com/watch?v=k63pDjKV3Ew)

# Сдача

[Форма для сдачи](https://goo.gl/forms/RnQN6UrGKdxPxPBG3)  
[Feedback](https://goo.gl/forms/9aizSzOUrx7EvGlG3)