In [2]:
!pip install pymorphy2==0.8
!git clone -b v2.1 https://github.com/buriy/spacy-ru.git && cp -r ./spacy-ru/ru2/. ./ru2

Collecting pymorphy2==0.8
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |███████                         | 10kB 23.7MB/s eta 0:00:01[K     |██████████████▏                 | 20kB 4.7MB/s eta 0:00:01[K     |█████████████████████▎          | 30kB 6.4MB/s eta 0:00:01[K     |████████████████████████████▍   | 40kB 6.4MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.3MB/s 
[?25hCollecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 17.6MB/s 
[?25hCollecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-

In [0]:
import spacy
import torch
import torch.nn as nn

from torchtext.data import Field, BucketIterator
from torchtext.datasets import TranslationDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
en_lang = spacy.load('en')
ru_lang = spacy.load('ru2')

In [0]:
def tokenize_ru(sentence):
    return [tok.text for tok in ru_lang.tokenizer(sentence)]

def tokenize_en(sentence):
    return [tok.text for tok in en_lang.tokenizer(sentence)]

ru = Field(tokenize=tokenize_ru, init_token='<sos>', eos_token='<eos>', lower=True)
en = Field(tokenize=tokenize_en, init_token='<sos>', eos_token = '<eos>', lower = True)

In [7]:
%%time
dataset = TranslationDataset("drive/My Drive/IU_course_files/PMLDL/HW4/corpus.en_ru.150t", 
                             exts=('.ru', '.en'),
                             fields=(ru, en))

CPU times: user 2min 47s, sys: 1.13 s, total: 2min 48s
Wall time: 2min 54s


In [0]:
train_data, test_data, valid_data = dataset.split(split_ratio=[0.7, 0.2, 0.1],
                                                  stratified=False)

In [0]:
ru.build_vocab(train_data, min_freq=2)
en.build_vocab(train_data, min_freq=2)

In [0]:
import dill

with open("drive/My Drive/IU_course_files/PMLDL/HW4/ru.Field", "rb") as fd:
    ru = dill.load(fd)

with open("drive/My Drive/IU_course_files/PMLDL/HW4/en.Field", "rb") as fd:
    en = dill.load(fd)

In [7]:
len(ru.vocab), len(en.vocab)

(78871, 35399)

In [0]:
BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

In [10]:
import random
from typing import Tuple

import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


class Encoder(nn.Module):
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor]:

        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        return outputs, hidden


class Attention(nn.Module):
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:

        src_len = encoder_outputs.shape[0]

        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((
            repeated_decoder_hidden,
            encoder_outputs),
            dim = 2)))

        attention = torch.sum(energy, dim=2)

        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: int,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)


    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:

        a = self.attention(decoder_hidden, encoder_outputs)

        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep


    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor]:

        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))

        return output, decoder_hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> token
        output = trg[0,:]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs


INPUT_DIM = len(ru.vocab)
OUTPUT_DIM = len(en.vocab)
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


model.apply(init_weights)
optimizer = optim.Adam(model.parameters())

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,712,239 trainable parameters


In [0]:
PAD_IDX = en.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [0]:
import math
import time


def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(iterator):
        print(f"\rBatch [{i}/{len(iterator)}]", end='')
        src, trg = batch.src, batch.trg
    
        optimizer.zero_grad()
        output = model(src, trg)
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
    
    print()
    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for _, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    if epoch + 1 % 2 == 0:
        torch.save(model.state_dict(),
            f"drive/My Drive/IU_course_files/PMLDL/HW4/model_epoch{epoch + 1}.pth")

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Batch [3281/3282]
Epoch: 01 | Time: 51m 3s
	Train Loss: 6.536 | Train PPL: 689.203
	 Val. Loss: 6.625 |  Val. PPL: 753.363
Batch [3281/3282]
Epoch: 02 | Time: 51m 12s
	Train Loss: 6.066 | Train PPL: 431.163
	 Val. Loss: 6.257 |  Val. PPL: 521.711
Batch [1387/3282]

KeyboardInterrupt: ignored

In [11]:
model.load_state_dict(torch.load("drive/My Drive/IU_course_files/PMLDL/HW4/model_epoch2.pth"))

<All keys matched successfully>

In [12]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(78871, 32)
    (rnn): GRU(32, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (embedding): Embedding(35399, 32)
    (rnn): GRU(160, 64)
    (out): Linear(in_features=224, out_features=35399, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [64]:
text = 'большой мост'
trg = ['<sos>']

tokens = tokenize_ru(text.lower())
tokens_ids = [ru.vocab.stoi[token] for token in tokens]
trg_ids = [en.vocab.stoi[token] for token in trg]
print(tokens)
tensor_view = torch.tensor(tokens_ids, dtype=torch.long).unsqueeze(1).to(device)
print(tensor_view)

encoder_outputs, hidden = model.encoder(tensor_view)

output = torch.tensor(trg_ids).unsqueeze(1).to(device)[0,:]
stop = False
i = 0
while i < 10:
    output, hidden = model.decoder(output, hidden, encoder_outputs)
    output = output.max(1)[1]
    print(en.vocab.itos[output])

    if en.vocab.itos[output] == '<eos>':
        break

['большой', 'мост']
tensor([[ 469],
        [8160]], device='cuda:0')
the
<unk>
in
the
<unk>
,
the
the
<unk>
,
the
the
<unk>
,
the
<unk>
,
the
<unk>
.
<eos>


----

In [0]:
data_path_ru = "drive/My Drive/IU_course_files/PMLDL/HW4/corpus.en_ru.1m.ru"
data_path_en = "drive/My Drive/IU_course_files/PMLDL/HW4/corpus.en_ru.1m.en"

data_path_ru_small = "drive/My Drive/IU_course_files/PMLDL/HW4/corpus.en_ru.150t.ru"
data_path_en_small = "drive/My Drive/IU_course_files/PMLDL/HW4/corpus.en_ru.150t.en"

In [0]:
import numpy as np

with open(data_path_ru) as fd_ru:
    with open(data_path_en) as fd_en:
        ru_sents = fd_ru.readlines()
        en_sents = fd_en.readlines()

        with open(data_path_ru_small, 'w') as fd_ru_s:
            fd_ru_s.write('\n'.join(ru_sents[100000:250000]))
        with open(data_path_en_small, 'w') as fd_en_s:
            fd_en_s.write('\n'.join(en_sents[100000:250000]))