## Задание
Реализуйте задачу машинного перевода с использованием transformer.
 [Датасет](http://www.manythings.org/anki/)

In [1]:
from io import open
import unicodedata
import string
import re
import random
import math
import numpy as np
from collections import Counter

import torch as tr
import torch.nn as nn
from torch import Tensor
from torch.nn import Transformer, TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F

from timeit import default_timer as timer
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

device = tr.device("cuda" if tr.cuda.is_available() else "cpu") 
device

device(type='cuda')

In [2]:
!wget https://www.manythings.org/anki/rus-eng.zip
!unzip rus-eng.zip

--2022-08-27 08:18:30--  https://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14819554 (14M) [application/zip]
Saving to: ‘rus-eng.zip’


2022-08-27 08:18:31 (17.1 MB/s) - ‘rus-eng.zip’ saved [14819554/14819554]

Archive:  rus-eng.zip
  inflating: rus.txt                 
  inflating: _about.txt              


In [3]:
SRC_LANGUAGE = 'eng'
TGT_LANGUAGE = 'rus'

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3}
        self.index2word = {0: '<unk>', 1: '<pad>', 2: '<bos>', 3: '<eos>'}
        self.word2count = {}
        self.n_words = 4 

    def addSentence(self, sentence):
        for word in sentence:#.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-ЯёЁ.!?]+", r" ", s)
    return s

In [9]:
def readLangs(s_limit):
    print("Reading lines...")
    lines = open('rus.txt', encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s) for s in re.split('\t', l)] for l in lines]
    pairs_list = []
    for string in pairs:
        eng_split =  [i for i in re.split('[\.!?]', string[0]) if i != '']
        esp_split =  [i for i in re.split('[\.!?]', string[1]) if i != '']    
        if len(eng_split) ==  len(esp_split):
            for i in range(len(eng_split)):            
                eng_s = re.findall('[\w]+', eng_split[i])[:s_limit]
                esp_s = re.findall('[\w]+', esp_split[i])[:s_limit]
                pairs_list.append([eng_s, esp_s]) 
    return pairs_list

In [10]:
def prepareData(i_lang, o_lang, se_limit):
    pairs = readLangs(se_limit)
    inp_lang  = Lang(i_lang)
    out_lang  = Lang(o_lang)
    print("Read %s sentence pairs" % len(pairs))
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        inp_lang.addSentence(pair[0])
        out_lang.addSentence(pair[1])
    print("Counted words:")
    print(inp_lang.name, inp_lang.n_words)
    print(out_lang.name, out_lang.n_words)
    return inp_lang, out_lang, pairs

In [11]:
def Prepare_Filt_Data(im_lang = SRC_LANGUAGE, ou_lang = TGT_LANGUAGE, 
                      sen_limit = 20,  w_limit = 5):
    i1_lang, o1_lang, pairs=prepareData(im_lang, ou_lang, sen_limit)
    inp_lang_  = Lang(im_lang)
    out_lang_  = Lang(ou_lang)

    eng_words_select = set([i for i in i1_lang.word2count.keys() 
                               if i1_lang.word2count[i] > w_limit])
    esp_words_select = set([i for i in o1_lang.word2count.keys() 
                               if o1_lang.word2count[i] > w_limit])
    
    for pair in pairs:
        for wrd_i in pair[0]:
            if wrd_i in eng_words_select:
                inp_lang_.addWord(wrd_i)
        for wrd_o in pair[1]:
            if wrd_o in esp_words_select:
                out_lang_.addWord(wrd_o)

    print('Using filter: every dict word in sentenses > ', w_limit)
    print(inp_lang_.name, inp_lang_.n_words)
    print(out_lang_.name, out_lang_.n_words)

    pairs_select = []
    for i in range(len(pairs)):
        if all(word in eng_words_select for word in pairs[i][0]):
            if all(word in esp_words_select for word in pairs[i][1]):
                   pairs_select.append([pairs[i][0], pairs[i][1]]) 

    print(len(pairs_select), 'sentenses')
    return inp_lang_, out_lang_, pairs_select

In [12]:
input_lang, output_lang, pairs = Prepare_Filt_Data(sen_limit = 20, w_limit   = 5)

Reading lines...
Read 446069 sentence pairs
Trimmed to 446069 sentence pairs
Counting words...
Counted words:
eng 16569
rus 56210
Using filter: every dict word in sentenses >  5
eng 7050
rus 16041
379269  sentenses


In [14]:
full_len = len(pairs)
train_list = list(range(full_len))
random.shuffle(train_list)

In [15]:
MAX_LENGTH = 22

def sent_to_torch(sent_in, l):
    sent_for_torch = np.zeros((MAX_LENGTH))
    for b in range(len(sent_in) + 2):
        if b == 0:
            sent_for_torch[b] = 2
        elif b <= len(sent_in):
            sent_for_torch[b] = l.word2index[sent_in[b-1]]
        elif (b==len(sent_in) + 1 and b <= MAX_LENGTH) or b == MAX_LENGTH+1:
            sent_for_torch[b] = 3
    return sent_for_torch

In [16]:
def torch_to_sent(sent_tens, l):
  sentence = [l.index2word(i) for i in sent_tens if i  > 3]
  return ' '.join(sentence)

In [17]:
def get_batch(pairs_num_list, batch_size):
  batch_list = random.sample(pairs_num_list, batch_size)

  s_np_in  = np.zeros((22))
  s_np_out = np.zeros((22))

  for a in range(batch_size):
    sent_in = pairs[batch_list[a]][0]
    sent_in = sent_to_torch(sent_in, input_lang)
    s_np_in = np.vstack([s_np_in, sent_in])
       
    sent_out = pairs[batch_list[a]][1]
    sent_out = sent_to_torch(sent_out, output_lang)
    s_np_out = np.vstack([s_np_out, sent_out])

  data   = tr.tensor( s_np_in[1:], dtype=tr.long, device=device).view(-1, batch_size)
  target = tr.tensor(s_np_out[1:], dtype=tr.long, device=device).view(-1)

  return data, target

## Transformer

In [18]:
class TransformerModel(nn.Module):

    def __init__(self, n_token_in, n_token_out, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()

        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(n_token_in, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, n_token_out)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (tr.triu(tr.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [19]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = tr.zeros(max_len, d_model)
        position = tr.arange(0, max_len, dtype=tr.float).unsqueeze(1)
        div_term = tr.exp(tr.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = tr.sin(position * div_term)
        pe[:, 1::2] = tr.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [20]:
n_tokens_in = input_lang.n_words
n_tokens_out= output_lang.n_words

In [21]:
emsize = 200   
nhid = 200    
nlayers = 2     
nhead = 2    
dropout = 0.5 
model = TransformerModel(n_tokens_in, n_tokens_out, emsize, nhead, nhid, nlayers, dropout).to(device)

In [26]:
criterion = nn.CrossEntropyLoss()
lr = 5  
optimizer = tr.optim.SGD(model.parameters(), lr=lr)
scheduler = tr.optim.lr_scheduler.StepLR(optimizer, 3.0, gamma=0.8)
batch_size = 50
range_step = 150

def train():
  model.train()
  total_loss = 0.
  start_time = time.time()
  ntokens = n_tokens_out
  for i in range(range_step):
    data, targets = get_batch(train_list, batch_size)
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output.view(-1, ntokens), targets)
    loss.backward()
    tr.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    total_loss += loss.item()
        
    if i in range(1, range_step, 20) and i > 0:
      cur_loss = total_loss /i
      elapsed = time.time() - start_time
      print(f'| epoch {epoch:3d} | lr { scheduler.get_last_lr()[0]:02.2f} | loss {cur_loss:5.6f} | ppl { math.exp(cur_loss):8.6f}')
    
  return total_loss / range_step, model

In [39]:
best_val_loss = float("inf")
epochs = 20
best_model = None

for epoch in range(1, epochs + 1):
  epoch_start_time = time.time()
  t_loss, model_   = train()
  print('-' * 55)
  print(f'|mean loss epoch   {epoch:3}|mean l {t_loss:5.6f}|vppl { math.exp(t_loss):4.6f}')
  print('-' * 55)

  if t_loss < best_val_loss:
      best_val_loss = t_loss
      best_model = model_

  scheduler.step()

| epoch   1 | lr 0.02 | loss 3.026301 | ppl 20.620815
| epoch   1 | lr 0.02 | loss 1.558009 | ppl 4.749357
| epoch   1 | lr 0.02 | loss 1.542652 | ppl 4.676975
| epoch   1 | lr 0.02 | loss 1.530898 | ppl 4.622326
| epoch   1 | lr 0.02 | loss 1.520751 | ppl 4.575662
| epoch   1 | lr 0.02 | loss 1.517427 | ppl 4.560475
| epoch   1 | lr 0.02 | loss 1.516093 | ppl 4.554396
| epoch   1 | lr 0.02 | loss 1.516448 | ppl 4.556014
-------------------------------------------------------
|mean loss epoch     1|mean l 1.504124|vppl 4.500210
-------------------------------------------------------
| epoch   2 | lr 0.02 | loss 3.138358 | ppl 23.065961
| epoch   2 | lr 0.02 | loss 1.579426 | ppl 4.852169
| epoch   2 | lr 0.02 | loss 1.534810 | ppl 4.640444
| epoch   2 | lr 0.02 | loss 1.534148 | ppl 4.637373
| epoch   2 | lr 0.02 | loss 1.539529 | ppl 4.662393
| epoch   2 | lr 0.02 | loss 1.538313 | ppl 4.656727
| epoch   2 | lr 0.02 | loss 1.537180 | ppl 4.651454
| epoch   2 | lr 0.02 | loss 1.531871 

In [40]:
best_model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        

In [41]:
def translate_to_spain(idx):
    print('Предложение на английском :', ' '.join(pairs[idx][0])) 
    print('Предложение на испанском  :', ' '.join(pairs[idx][1])) 

    sent_np_in = sent_to_torch(pairs[idx][0], input_lang)
    output     = best_model(tr.tensor(sent_np_in,  dtype=tr.long, device=device))

    print('Перевод seq2seq           :', 
          ' '.join([output_lang.index2word[i.item()] 
                    for i in output[0].data.topk(1)[1] if i > 3])) 

In [42]:
translate_to_spain(72422)

Предложение на английском : he is close to sixty
Предложение на испанском  : ему около шестидесяти
Перевод seq2seq           : он


In [43]:
translate_to_spain(11111)

Предложение на английском : tom is normal
Предложение на испанском  : том обычныи
Перевод seq2seq           : том


In [44]:
translate_to_spain(54863)

Предложение на английском : we re very curious
Предложение на испанском  : мы очень любопытны
Перевод seq2seq           : мы очень
