### Домашнее задание 7

Возьмите англо-русскую пару фраз (https://www.manythings.org/anki/)

Обучите на них seq2seq по аналогии с занятием. Оцените полученное качество

Попробуйте добавить +1 рекуррентный в encoder и decoder

Попробуйте заменить GRU ячейки на lstm-ячейки

Оцените качество во всех случаях

In [3]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
%matplotlib inline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
with open('eng-rus.txt', encoding='utf-8') as f:
    my_lines = f.readlines()

In [10]:
#my_lines

In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-ЯёЁ.!?]+", r" ", s)
    return s

In [5]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    for p in pairs:
        del p[2]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [14]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 444587 sentence pairs
Trimmed to 25903 sentence pairs
Counting words...
Counted words:
rus 9706
eng 4150
['я занят подготовкои к следующему экзамену .', 'i m busy preparing for the next exam .']


In [15]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [16]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [17]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
    
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [18]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [19]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [20]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

#### Обучите на них seq2seq по аналогии с занятием. Оцените полученное качество

##### The Encoder

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

##### The Decoder

In [22]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [23]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1, 5000, print_every=250)

0m 3s (- 1m 4s) (250 5%) 4.0591
0m 5s (- 0m 49s) (500 10%) 3.5321
0m 7s (- 0m 43s) (750 15%) 3.4920
0m 9s (- 0m 38s) (1000 20%) 3.4322
0m 11s (- 0m 35s) (1250 25%) 3.2817
0m 13s (- 0m 32s) (1500 30%) 3.2136
0m 15s (- 0m 29s) (1750 35%) 3.1301
0m 18s (- 0m 27s) (2000 40%) 3.1665
0m 20s (- 0m 24s) (2250 45%) 3.1217
0m 22s (- 0m 22s) (2500 50%) 3.1019
0m 24s (- 0m 19s) (2750 55%) 3.0399
0m 26s (- 0m 17s) (3000 60%) 2.9474
0m 28s (- 0m 15s) (3250 65%) 3.0474
0m 30s (- 0m 13s) (3500 70%) 2.9607
0m 32s (- 0m 10s) (3750 75%) 2.9005
0m 35s (- 0m 8s) (4000 80%) 2.8845
0m 37s (- 0m 6s) (4250 85%) 2.8323
0m 40s (- 0m 4s) (4500 90%) 2.7986
0m 42s (- 0m 2s) (4750 95%) 2.8663
0m 44s (- 0m 0s) (5000 100%) 2.7934


In [24]:
evaluateRandomly(encoder1, decoder1)

> я же не умер ?
= i m not dead am i ?
< i m not sure that you re not . <EOS>

> мы не будем терять времени .
= we re not going to waste time .
< we re not going to <EOS>

> я чувствую холод .
= i m feeling cold .
< i m a . . <EOS>

> я фотограф любитель .
= i m a shutterbug .
< i m a . <EOS>

> ты к этому не готова .
= you aren t ready for this .
< you re not going to you <EOS>

> уверен что вы неправильно поняли сказанное .
= i m sure you misunderstood what was said .
< i m sure that you re the . . <EOS>

> я живу одна .
= i m living alone .
< i m a . . <EOS>

> я не пытаюсь ничего сказать .
= i m not trying to say anything .
< i m not going to tom . <EOS>

> я красивая .
= i m beautiful .
< i m a . <EOS>

> я смотрю на те цветы .
= i m looking at those flowers .
< i m going to to be . <EOS>



#### Попробуйте добавить +1 рекуррентный в encoder и decoder

##### The Encoder

In [25]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

##### The Decoder

In [26]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

In [27]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1, 5000, print_every=250)

0m 2s (- 0m 49s) (250 5%) 4.2825
0m 5s (- 0m 45s) (500 10%) 3.5881
0m 7s (- 0m 42s) (750 15%) 3.3510
0m 9s (- 0m 39s) (1000 20%) 3.2673
0m 12s (- 0m 36s) (1250 25%) 3.3078
0m 14s (- 0m 34s) (1500 30%) 3.2263
0m 17s (- 0m 31s) (1750 35%) 3.1572
0m 19s (- 0m 29s) (2000 40%) 3.0763
0m 22s (- 0m 27s) (2250 45%) 3.1297
0m 24s (- 0m 24s) (2500 50%) 2.9650
0m 27s (- 0m 22s) (2750 55%) 3.0434
0m 29s (- 0m 19s) (3000 60%) 3.0877
0m 32s (- 0m 17s) (3250 65%) 2.9989
0m 35s (- 0m 15s) (3500 70%) 2.8771
0m 37s (- 0m 12s) (3750 75%) 2.9191
0m 40s (- 0m 10s) (4000 80%) 2.9310
0m 42s (- 0m 7s) (4250 85%) 2.9233
0m 44s (- 0m 4s) (4500 90%) 2.8473
0m 47s (- 0m 2s) (4750 95%) 2.8835
0m 49s (- 0m 0s) (5000 100%) 2.7683


In [28]:
evaluateRandomly(encoder1, decoder1)

> мы ждем подходящего момента .
= we re waiting for the right moment .
< we re going to . . <EOS>

> по понедельникам я обычно дома .
= i m usually at home on mondays .
< i m going to a . . <EOS>

> мы двигаемся в правильном направлении .
= we re headed in the right direction .
< we re going to to . . <EOS>

> я рада что наконец с вами познакомилась .
= i m glad to finally meet you .
< i m sure to you to be . . <EOS>

> он еи в отцы годится .
= he is old enough to be her father .
< he is the a to . . . <EOS>

> я это полностью осознаю .
= i m fully aware of that .
< i m sure to be this . <EOS>

> тебе послышалось .
= you are hearing things .
< you re very to . . <EOS>

> вы богатыи человек .
= you re a rich man .
< you re very . . <EOS>

> я уверена том будет в восторге .
= i m sure tom will be thrilled .
< i m sure to this to this . <EOS>

> они плодятся как кролики .
= they re breeding like rabbits .
< we re going to to . . <EOS>



#### Попробуйте заменить GRU ячейки на lstm-ячейки

In [None]:
import csv

input_rus_txt = []
output_eng_txt = []

for pair in pairs:
    input_rus_txt.append(pair[0])
    output_eng_txt.append(pair[1])
    
with open('rus-eng/rus.txt', 'w') as f:
    for item in input_rus_txt:
        f.write("%s\n" % item)
        
with open('rus-eng/eng.txt', 'w') as f:
    for item in output_eng_txt:
        f.write("%s\n" % item)
        
import pandas as pd

df = pd.DataFrame(pairs)
df.to_csv("rus-eng/pairs_tsv.tsv", sep="\t", index=False)

In [1]:
# These are the standard torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torchtext import data # Helper libraries
import random
from tqdm import tqdm

In [2]:
#from torchtext.data import Field, BucketIterator
import torchtext
from torchtext.data.utils import get_tokenizer

import spacy
import numpy as np

import random
import math
import time

In [4]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download de_core_news_sm
#!python -m spacy download ru_core_news_sm

In [6]:
spacy_ru = spacy.load('ru_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [10]:
def tokenize_ru(text):
    """
    Tokenizes Russian text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_ru.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [7]:
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

In [9]:
from torchtext.legacy.datasets import Multi30k
#from torchtext.data import Field, BucketIterator

In [11]:
SRC = Field(tokenize = tokenize_ru, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [13]:
rus_pair_txt = 'rus.txt'
en_pair_txt = 'eng.txt'

In [16]:
MAX_VOCAB_SIZE = 30_000
MIN_COUNT = 3
MAX_SEQUENCE_LENGTH = 15
BATCH_SIZE = 128 # Create Field object

# TEXT = data.Field(tokenize = 'spacy', lower=True, include_lengths = True, init_token = '<sos>', 
#             eos_token = '<eos>') # Specify Fields in our dataset

fields = [('input_sequence', SRC), ('output_sequence', TRG)] 

# .tsv containing the statement/response pairs
data_file = 'pairs_tsv.tsv' # Build the dataset

dialogue_data = TabularDataset(
    path=data_file, format='tsv',
    fields=fields)
    
# Build vocabulary, and include pretrained GLoVe vectors
SRC.build_vocab(dialogue_data,
                max_size=MAX_VOCAB_SIZE,
                min_freq=MIN_COUNT,
                vectors='glove.6B.300d',
                unk_init=torch.Tensor.normal_) # Split our dialogue data into training, validation, and test sets
train_data, test_data = dialogue_data.split()
train_data, valid_data = train_data.split()

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:34<00:00, 11565.46it/s]


In [17]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 12693
Number of validation examples: 5440
Number of testing examples: 7771


In [18]:
print(vars(train_data.examples[0]))

{'input_sequence': ['.', 'работои', 'своеи', 'доволен', 'он'], 'output_sequence': ['he', 'is', 'pleased', 'with', 'his', 'work', '.']}


In [19]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [20]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 3161
Unique tokens in target (en) vocabulary: 2016


In [21]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=False,
    device = device)

In [22]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [23]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [24]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [25]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [26]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3161, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2016, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2016, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,715,936 trainable parameters


In [28]:
optimizer = optim.Adam(model.parameters())

In [29]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [30]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.input_sequence
        trg = batch.output_sequence
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [31]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.input_sequence
            trg = batch.output_sequence

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [32]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
N_EPOCHS = 30
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 4s
	Train Loss: 3.767 | Train PPL:  43.248
	 Val. Loss: 3.330 |  Val. PPL:  27.941
Epoch: 02 | Time: 0m 4s
	Train Loss: 3.204 | Train PPL:  24.632
	 Val. Loss: 3.079 |  Val. PPL:  21.744
Epoch: 03 | Time: 0m 4s
	Train Loss: 2.878 | Train PPL:  17.785
	 Val. Loss: 2.862 |  Val. PPL:  17.489
Epoch: 04 | Time: 0m 6s
	Train Loss: 2.661 | Train PPL:  14.310
	 Val. Loss: 2.775 |  Val. PPL:  16.041
Epoch: 05 | Time: 0m 4s
	Train Loss: 2.480 | Train PPL:  11.936
	 Val. Loss: 2.633 |  Val. PPL:  13.922
Epoch: 06 | Time: 0m 4s
	Train Loss: 2.309 | Train PPL:  10.062
	 Val. Loss: 2.483 |  Val. PPL:  11.982
Epoch: 07 | Time: 0m 4s
	Train Loss: 2.161 | Train PPL:   8.676
	 Val. Loss: 2.425 |  Val. PPL:  11.303
Epoch: 08 | Time: 0m 4s
	Train Loss: 2.032 | Train PPL:   7.630
	 Val. Loss: 2.306 |  Val. PPL:  10.033
Epoch: 09 | Time: 0m 4s
	Train Loss: 1.902 | Train PPL:   6.699
	 Val. Loss: 2.262 |  Val. PPL:   9.601
Epoch: 10 | Time: 0m 4s
	Train Loss: 1.803 | Train PPL:   6.069


In [34]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.682 | Test PPL:   5.376 |


0m 44s (- 0m 0s) (5000 100%) 2.7934 - loss при обучении одним слоем GRU

0m 49s (- 0m 0s) (5000 100%) 2.7683 - loss при обучении двухслойным GRU в enc и dec