## Install specific requirements
(once you did it - you can skip it)

In [None]:
# Update torchtext
!pip install torchtext -U
# Install YouTokenToMe for tokenization
!pip install youtokentome

unpack .zip from https://translate.yandex.ru/corpus?lang=en in root folder of project

## Imports

In [2]:
%matplotlib inline

import time
from tqdm import tqdm
import os

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import youtokentome as yttm

vocab_size = 32000
PADDING_TOKEN = 0
UNK_TOKEN = 1
BOS_TOKEN = 2
EOS_TOKEN = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# dataset
path = 'corpus.en_ru.1m'
tokenizer_path = f'{path}_v{vocab_size}.tokenizer'

# Model save path.
model_save_path = "model.pth"


def load_files(path):
    res = ([], [])
    for i, ext in enumerate(['.en', '.ru']):
        with open(path + ext, encoding='utf-8') as in_file:
            res[i].extend(in_file.readlines())
    return res

## Prepare data
(once you did it - you can skip it)

In [3]:
data_en, data_ru = load_files(path)

raw_data = {'English' : [line for line in data_en], 'Russian': [line for line in data_ru]}

df = pd.DataFrame(raw_data, columns=list(raw_data.keys()))
df.shape

(1000000, 2)

In [4]:
df['en_len'] = df['English'].str.count(' ')
df['ru_len'] = df['Russian'].str.count(' ')
df.sort_values(['ru_len', 'en_len'], ascending=[True, True], inplace=True)
del df['en_len'], df['ru_len']
df.shape

(1000000, 2)

In [5]:
df = df[:50000]
df.shape

(50000, 2)

In [6]:
# Create train, test, val sets.
train, test = train_test_split(df, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
val.to_csv('val.csv', index=False)

## Load data

In [7]:
if os.path.exists(tokenizer_path):
    tokenizer = yttm.BPE(model=tokenizer_path)
else:
    # Create temp file with data to train tokenizer.
    data_en, data_ru = load_files(path)
    temp_file_path = 'tokenizer_text.temp'
    with open(temp_file_path, 'w', encoding='utf8') as out_file:
        out_file.write('\n'.join(map(str.lower, data_en)))
        out_file.write('\n'.join(map(str.lower, data_ru)))
    # Train tokenizer.
    tokenizer = yttm.BPE.train(data=temp_file_path, vocab_size=vocab_size, model=tokenizer_path)
    os.remove(temp_file_path)
print('Vocab size:', tokenizer.vocab_size())

Vocab size: 32000


## Dataloaders

In [9]:
class TextDataset(torch.utils.data.Dataset):

    __output_types = { 'id': yttm.OutputType.ID,
                       'subword':yttm.OutputType.SUBWORD }

    def __init__(self, csv_file, tokenizer, max_len=60, max_len_ratio=1.5):
        self.tokenizer = tokenizer
        df = pd.read_csv(csv_file)
        # Tokenize sentences using tokenizer.
        tokenize_lambda = lambda x: self.tokenize(x.lower().strip(), 'subword')
        df['eng_enc'] = df.English.apply(tokenize_lambda)
        df['rus_enc'] = df.Russian.apply(tokenize_lambda)
        # Delete sentences that exceed the max length and max length ratio.
        df['en_len'] = df['eng_enc'].str.len()
        df['ru_len'] = df['rus_enc'].str.len()
        df.query(f'ru_len < {max_len} & en_len < {max_len}', inplace=True)
        df.query(f'ru_len < en_len * {max_len_ratio} & ru_len * {max_len_ratio} > en_len', inplace=True)
        # Sort the values for less padding in batching.
        df.sort_values(['ru_len', 'en_len'], ascending=[False, False], inplace=True)
        # TODO: better unpacking
        raw_src, raw_tgt = zip(df[['Russian', 'English']].T.values)
        src, tgt = zip(df[['rus_enc', 'eng_enc']].T.values)
        self.tgt, self.src = tgt[0], src[0]
        self.raw_src, self.raw_tgt = raw_src[0], raw_tgt[0]
        

    def tokenize(self, s, output_type='id'):
        """Tokenize the sentence.
        :param s: the sentence to tokenize
        :param output_type: either 'id' or 'subword' for corresponding output
        :return: tokenized sentence"""
        return self.tokenizer.encode(s, output_type=self.__output_types[output_type],
                                bos=True, eos=True)
    def decode(self, tokens):
        return self.tokenizer.id_to_subword(tokens)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        src = self.src[idx]
        src = [self.tokenizer.subword_to_id(token) for token in src]
        tgt = self.tgt[idx]
        tgt = [self.tokenizer.subword_to_id(token) for token in tgt]
        return src, tgt

def load_datasets(tokenizer, ext='.csv'):
    res = []
    for name in  ['train', 'val', 'test']:
        dataset_path = name + ext
        res.append(TextDataset(dataset_path, tokenizer))
    return res

train_data, val_data, test_data = load_datasets(tokenizer)
print('Train:', len(train_data),
      '\nVal:', len(val_data),
      '\nTest:', len(test_data))

Train: 34796 
Val: 4374 
Test: 4352


In [10]:
def my_collate(batch):
    src, tgt = zip(*batch)
    src = [Tensor(s) for s in src]
    tgt = [Tensor(t) for t in tgt]
    # TODO: Generalize padding value
    src = pad_sequence(src, batch_first=True, padding_value=PADDING_TOKEN).long()
    tgt = pad_sequence(tgt, batch_first=True, padding_value=PADDING_TOKEN).long()
    return src.t(), tgt.t()

def make_dataloaders(datasets, batch_size, num_workers=0):
    res = []
    for dataset in datasets:
        res.append(DataLoader(dataset, batch_size=batch_size,
                        shuffle=False, num_workers=num_workers, collate_fn=my_collate))
    return res

(train_iterator,
 val_iterator,
 test_iterator) = make_dataloaders([train_data, val_data, test_data],
                                   batch_size=8,
                                   num_workers=0)

data_iterators = {
    'train': train_iterator,
    'val': val_iterator,
    'test': test_iterator,
}

## Model

In [0]:
# Source of origin: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
    
    def create_pe(self, seq_len):
        pe = torch.zeros(seq_len, self.d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        return pe

    def forward(self, x):
        pe = self.create_pe(x.size(0))
        x = x + pe.to(x.device)
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntokens_src, ntokens_tgt, ninp, nhead, dim_feedforward, nlayers, pad_token, dropout=0.1):
        super(TransformerModel, self).__init__()
        from torch.nn import Transformer
        self.model_type = 'Transformer'
        self.ninp = ninp
        self.pad_token = pad_token
        self.masks = {
            'src': None,
            'tgt': None,
            'memory': None,
        }
        # Token Encoders
        self.src_encoder = nn.Embedding(ntokens_src, ninp)
        self.tgt_encoder = nn.Embedding(ntokens_tgt, ninp)
        # Positional Encoding
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # Transformer
        self.transformer = Transformer(
            d_model=ninp,
            nhead=nhead,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
        )
        self.out = nn.Linear(ninp, ntokens_tgt)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sx, sy=None):
        sy = sy or sx
        mask = (torch.triu(torch.ones((sx, sy))) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        self.transformer._reset_parameters()
    
    def preprocess(self, x, x_type):
        # Create masks
        padding_mask = (x == self.pad_token).bool().t()
        if self.masks[x_type] is None or self.masks[x_type].size(0) != len(x):
            self.masks[x_type] = self._generate_square_subsequent_mask(len(x), len(x)).to(x.device)
        
        x_enc = self.src_encoder(x) if x_type == 'src' else self.tgt_encoder(x)
        x_enc *= math.sqrt(self.ninp) # TODO: * or / or remove?
        x_enc = self.pos_encoder(x_enc)
        
        return x_enc, self.masks[x_type], padding_mask
        
    def forward(self, src, tgt):

        if (    self.masks['memory'] is None or
                self.masks['src'].size(0) != len(src) or
                self.masks['tgt'].size(0) != len(tgt)):
            self.masks['memory'] = self._generate_square_subsequent_mask(len(src), len(tgt)).to(src.device)
        
        src_enc, _, src_key_padding_mask = self.preprocess(src, 'src')
        tgt_enc, _, tgt_key_padding_mask = self.preprocess(tgt, 'tgt')
        memory_key_padding_mask = src_key_padding_mask.clone().detach()
        
        output = self.transformer(src_enc, tgt_enc,
                                  src_mask=self.masks['src'],
                                  tgt_mask=self.masks['tgt'],
                                  memory_mask=self.masks['memory'],
                                  src_key_padding_mask=src_key_padding_mask,
                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                  memory_key_padding_mask=memory_key_padding_mask,
                                  )
        output = self.out(output)
        return output

In [0]:
def run_model(model, criterion, optimizer, data_iterator, is_train_phase, n_words=1, desc=''):
    if is_train_phase:
        model.train()
    else:
        model.eval()
    total_loss = 0.0
    pbar = tqdm(total=len(data_iterator), desc=desc)
    for i, (src, tgt) in enumerate(data_iterator):
        src, tgt = src.to(device), tgt.to(device)
        
        tgt_losses = 0.0
        for j in range(max(1, len(tgt) - n_words), len(tgt)):
            optimizer.zero_grad()
            tgt_in = tgt[:j, :]
            tgt_out = tgt[1:j+1, :]
            
            with torch.set_grad_enabled(is_train_phase):
                output = model(src, tgt_in).transpose(1, 2)
                loss = criterion(output, tgt_out)

                if is_train_phase:
                    loss.backward()
                    # Clip gradient to deal with gradient explosion
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
                    optimizer.step()
            tgt_losses += loss.item()
        total_loss += tgt_losses / j
        pbar.update(1)
        pbar.set_description(desc + f'- loss: {total_loss / (i+1):7.4}')
    return total_loss / (i+1)

def train_model(model, n_epochs, data_iterators,
                criterion, optimizer, n_words=1, scheduler=None, model_save_path=None):
    stats = {'train':{'loss':[]},
             'val':{'loss':[]}}
    best_loss = None
    
    for epoch in range(n_epochs):
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        print(f'------------ Epoch {epoch}; lr: {lr:.5f} ------------')
        for phase in ['train', 'val']:
            desc = f'{phase.title()} Epoch #{epoch} '
            epoch_loss = run_model(model, criterion, optimizer,
                                   data_iterators[phase], phase == 'train',
                                   n_words, desc)
            stats[phase]['loss'].append(epoch_loss)
            print_hist = lambda l: ' -> '.join(map(lambda x:f"{x:.4}", l[-2:]))
            tqdm.write(f'{phase.title()} Loss: ' + print_hist(stats[phase]['loss']))
        if best_loss == None or stats['val']['loss'][-1] < best_loss:
            best_loss = stats['val']['loss'][-1]
            print('Smallest val loss')
            if model_save_path:
                print('Saving model...')
                torch.save(model, model_save_path)
        translate(model, 'Машинное обучение это здорово!', verbose=True)
        scheduler.step()
    return stats

In [0]:
ntokens_src = tokenizer.vocab_size() # the size of vocabulary
ntokens_tgt = tokenizer.vocab_size() # the size of vocabulary
emsize = 512 # embedding dimension
nhid = 1024 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 5 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
model = TransformerModel(ntokens_src, ntokens_tgt, emsize, nhead, nhid, nlayers, PADDING_TOKEN, dropout).to(device)

In [0]:
# Ignore padding index during the loss computation.
criterion = nn.CrossEntropyLoss(ignore_index=PADDING_TOKEN, reduction='mean')
lr = 0.8
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.97)

In [0]:
torch.cuda.empty_cache()
n_epochs = 2
n_words = 1
stats = train_model(model, n_epochs, data_iterators,
                    criterion, optimizer, n_words, scheduler, model_save_path)

------------ Epoch 0; lr: 0.77600 ------------


HBox(children=(FloatProgress(value=0.0, description='Train Epoch #0 ', max=4002.0, style=ProgressStyle(descrip…

Train Loss: 0.466


HBox(children=(FloatProgress(value=0.0, description='Val Epoch #0 ', max=502.0, style=ProgressStyle(descriptio…

Val Loss: 0.6473
Smallest val loss
Saving model...


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Saved successfully
------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁not': 0.009476497769355774, '▁it': 0.006838961038738489, '▁other': 0.006023898255079985}
  1 {'▁so': 0.013619394041597843, '▁not.': 0.013337705284357071, '▁very': 0.011635693721473217}
  2 {'▁not.': 0.04119184985756874, '▁well.': 0.028293495997786522, '▁possible.': 0.022640258073806763}
translation: <BOS> not so not.<EOS>
------------ Epoch 1; lr: 0.75272 ------------


HBox(children=(FloatProgress(value=0.0, description='Train Epoch #1 ', max=4002.0, style=ProgressStyle(descrip…

In [0]:
def subword_to_str(tokens):
    return ''.join(tokens).replace('▁', ' ')

def tokens_to_str(tokens):
    return subword_to_str([tokenizer.id_to_subword(ix) for ix in tokens])

def translate(model, text, max_len=80, custom_string=False, verbose=False):
    model.eval()
    
    if verbose:
        print('------------ Translation ------------')
        print('Input:', text)
    # Prepare text
    src = tokenizer.encode(text, output_type=yttm.OutputType.ID,
                           bos=True, eos=True)
    src = Tensor(src).long().to(device)
    # Run encoder
    src_enc, src_mask, _ = model.preprocess(src, 'src')
    e_outputs = model.transformer.encoder(src_enc, 
                                          src_mask,
                                          )
    
    # Prepare tensor for answers
    outputs = torch.zeros(max_len).type_as(src.data)
    # Set the first token as '<sos>'
    outputs[0] = torch.LongTensor([BOS_TOKEN])
    vals = []
    for i in range(1, max_len):
        outputs_enc, tgt_mask, _ = model.preprocess(outputs[:i].unsqueeze(1), 'tgt')
        d_out = model.transformer.decoder(outputs_enc, e_outputs,
                                          tgt_mask=tgt_mask,
                                          )
        out = model.out(d_out)
        out = F.softmax(out, dim=-1)
        val, ix = out.data.topk(3, dim=-1)
        outputs[i] = ix[-1][0][0]
        if outputs[i] == EOS_TOKEN:
            break
    result = tokens_to_str(outputs[:i+1])
    if verbose:
        print('Output weights:')
        for j in range(min(3, i)):
            print(f'  {j}', {tokenizer.id_to_subword(k):v.item()
                             for k, v in zip(ix[j][0], val[j][0])})
        print('translation:', result)
    return result

In [0]:
translate(model, 'Машинное обучение это здорово!', verbose=True)

------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁the': 0.13046355545520782, '▁in': 0.04928888380527496, '▁it': 0.045001816004514694}
  1 {'▁the': 0.08483277261257172, '▁in': 0.034705374389886856, '▁i': 0.03179672732949257}
  2 {'▁the': 0.06560049206018448, '▁in': 0.02773398533463478, '▁i': 0.027686230838298798}
translation: <BOS> the the the the the the the the the the the the the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the


'<BOS> the the the the the the the the the the the the the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the first the'

In [0]:
def translate_beam(model, text, max_len=10, beam_capacity=3, verbose=False):
    """
    Algorithm: https://www.youtube.com/watch?v=RLWuzLLSIgw
    """
    model.eval()
    if verbose:
        print('------------ Translation ------------')
        print('Input:', text)
    # Prepare text
    src = tokenizer.encode(text, output_type=yttm.OutputType.ID,
                           bos=True, eos=True)
    src = Tensor(src).long().to(device)
    # Run encoder
    src_enc, src_mask, _ = model.preprocess(src, 'src')
    e_outputs = model.transformer.encoder(src_enc, 
                                          src_mask,
                                          )

    # Prepare tensor for answers
    basic_vec = torch.zeros(max_len).type_as(src.data)
    basic_vec[0] = torch.LongTensor([BOS_TOKEN])

    beam_pool = [(basic_vec, 1.0)]

    def beam_filter(pool, top_k=beam_capacity):
        return sorted(pool, key=lambda x: x[1], reverse=True)[:top_k]

    for i in range(1, max_len):
        if verbose:
            print("Beam epoch: ", i)
        new_pool = []
        # For each candidate path:
        for beam, old_prob in beam_pool:
            outputs_enc, tgt_mask, _ = model.preprocess(beam[:i].unsqueeze(1), 'tgt')
            d_out = model.transformer.decoder(outputs_enc, e_outputs,
                                              tgt_mask=tgt_mask,
                                              )
            out = model.out(d_out)
            out = F.softmax(out, dim=-1)
            probs, ixs = out[-1, :].topk(beam_capacity)
            for prob, token_id in zip(probs.squeeze(), ixs.squeeze()):
                tmp_beam = beam.clone()
                tmp_beam[i] = token_id.item()
                new_pool.append((tmp_beam, prob * old_prob))
        beam_pool = beam_filter(new_pool)
        if verbose:
            for beam, old_prob in beam_pool:
                print("Candidate '{}' with prob: {:.7f}".format(
                    tokens_to_str(beam[1:i + 1]), prob * old_prob
                ))
        # Stop if EOS_TOKEN
        if beam_pool[0][0][i] == EOS_TOKEN:
            break
    the_best = beam_filter(beam_pool, 1)[0][0]
    result = tokens_to_str(the_best[:i+1])
    return result

In [0]:
translate_beam(model, 'Машинное обучение это здорово!', verbose=True)

------------ Translation ------------
Input: Машинное обучение это здорово!
Beam epoch:  1
Candidate 'но-демократи' with prob: 0.0000000
Candidate ' fruit,' with prob: 0.0000000
Candidate 'man,' with prob: 0.0000000
Beam epoch:  2
Candidate 'но-демократино-демократи' with prob: 0.0000000
Candidate 'но-демократи той' with prob: 0.0000000
Candidate 'но-демократи publications,' with prob: 0.0000000
Beam epoch:  3
Candidate 'но-демократино-демократи той' with prob: 0.0000000
Candidate 'но-демократино-демократино-демократи' with prob: 0.0000000
Candidate 'но-демократино-демократи publications,' with prob: 0.0000000
Beam epoch:  4
Candidate 'но-демократино-демократино-демократи той' with prob: 0.0000000
Candidate 'но-демократино-демократи той publications,' with prob: 0.0000000
Candidate 'но-демократино-демократи publications, единым' with prob: 0.0000000
Beam epoch:  5
Candidate 'но-демократино-демократино-демократи той publications,' with prob: 0.0000000
Candidate 'но-демократино-демократи

'<BOS>но-демократино-демократино-демократи той publications, единымно-демократи той publications,'

In [0]:
translate_beam(model, train_data.raw_src[index], verbose=True)

------------ Translation ------------
Input: Затем по нему пробежала вереница символов и стандартных заголовков: "Загрузка черепной коробки".

Beam epoch:  1
Candidate 'рующими' with prob: 0.0000000
Candidate ' установленный' with prob: 0.0000000
Candidate ' глаго' with prob: 0.0000000
Beam epoch:  2
Candidate 'рующими увяз' with prob: 0.0000000
Candidate 'рующими установленный' with prob: 0.0000000
Candidate 'рующими poet' with prob: 0.0000000
Beam epoch:  3
Candidate 'рующими увяз partial' with prob: 0.0000000
Candidate 'рующими увязно-демократи' with prob: 0.0000000
Candidate 'рующими увязрующими' with prob: 0.0000000
Beam epoch:  4
Candidate 'рующими увяз partial увяз' with prob: 0.0000000
Candidate 'рующими увяз partial get' with prob: 0.0000000
Candidate 'рующими увязрующими увяз' with prob: 0.0000000
Beam epoch:  5
Candidate 'рующими увяз partial get увяз' with prob: 0.0000000
Candidate 'рующими увязрующими увяз partial' with prob: 0.0000000
Candidate 'рующими увяз partial увяз 

'<BOS>рующими увяз partial get увяз partial get увяз увяз'