Разобраться с моделькой перевода как она устроена (c механизмом внимания), запустить для перевода с русского на английский (при желании можно взять другие пары языков)
**внимание классическое** с RNN-ками в энкодере и декодере

Возьмем модель из прошлого урока и добавим механизм внимания. (Ещё добавлю лематизацию, чтобы решить проблему с формами слов (издеваться -> издевалась))

In [1]:
# !wget http://www.manythings.org/anki/rus-eng.zip
# !mkdir data
# !unzip rus-eng.zip -d data/
# !ls /content/data/ -lah
# !pip install pymorphy2

## Импорты

In [2]:
import re
import numpy as np
from tqdm import tqdm
from time import time
from functools import lru_cache
from pymorphy2 import MorphAnalyzer
import multiprocessing as mp

from keras import Model
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, GRU, Dense, Layer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy

import tensorflow as tf
from tensorflow.data import Dataset

from sklearn.model_selection import train_test_split

## Настройки

In [3]:
PATH = "./data/rus.txt"
DATA_LEN = 200000

EPOCHS = 10
BATCH_SIZE = 128
EMBEDDING_DIM = 256
UNITS = 1024

## Данные

Добавим лематизацию

In [4]:
morpher = MorphAnalyzer()

@lru_cache(None)
def lemmatize(word):
    return morpher.parse(word)[0].normal_form

In [5]:
def preprocess(text):
    text = text.strip().lower()
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", text)
    # text = re.sub(r'[" "]+', " ",text)
    text = " ".join([lemmatize(word) for word in text.split(" ")])
    text = text.strip()
    return '<start> ' + text + ' <end>'

In [6]:
def tokenize(data):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(data)
    tensor = tokenizer.texts_to_sequences(data)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [7]:
def preprocess_line(line):
    return [preprocess(text) for text in line.split('\t')[:2]]

def load_data(path, num_examples):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        res = []

        with mp.Pool(mp.cpu_count()) as pool:
            res = pool.map(preprocess_line, tqdm(lines[:num_examples]))

        en, ru = zip(*res)
        ru_tensor, ru_tokenizer = tokenize(ru)
        en_tensor, en_tokenizer = tokenize(en)
        return ru_tensor, en_tensor, ru_tokenizer, en_tokenizer

In [8]:
ru_tensor, en_tensor, ru_tokenizer, en_tokenizer = load_data(PATH, DATA_LEN)

ru_len, en_len = ru_tensor.shape[1], en_tensor.shape[1]
ru_tensor_train, ru_tensor_val, en_tensor_train, en_tensor_val = train_test_split(ru_tensor, en_tensor, test_size=0.2)

BUFFER_SIZE = len(ru_tensor_train)
dataset = Dataset.from_tensor_slices((ru_tensor_train, en_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

100%|██████████| 200000/200000 [00:12<00:00, 15792.72it/s]


## Модель

In [9]:
class Encoder(Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(encoder_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoder_units))

In [10]:
class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [11]:
class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.decoder_units = decoder_units
        self.dense = Dense(vocab_size)
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(decoder_units)
        self.gru = GRU(decoder_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden, encoder_output):
        context_vector, attention_weights = self.attention(hidden, encoder_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.dense(output)
        return x, state, attention_weights

In [12]:
ru_vocab_size = len(ru_tokenizer.word_index)+1
en_vocab_size = len(en_tokenizer.word_index)+1

encoder = Encoder(ru_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
decoder = Decoder(en_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

optimizer = Adam()
loss_obj = SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [13]:
def loss_func(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_obj(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

## Обучение

In [14]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_func(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [15]:
steps_per_epoch = len(ru_tensor_train)//BATCH_SIZE

for epoch in range(EPOCHS):
    start_time = time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
        
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time() - start_time))

Epoch 1 Batch 0 Loss 4.9149
Epoch 1 Batch 100 Loss 2.3652
Epoch 1 Batch 200 Loss 2.0856
Epoch 1 Batch 300 Loss 2.0218
Epoch 1 Batch 400 Loss 2.0052
Epoch 1 Batch 500 Loss 2.0307
Epoch 1 Batch 600 Loss 1.9602
Epoch 1 Batch 700 Loss 1.9463
Epoch 1 Batch 800 Loss 1.8339
Epoch 1 Batch 900 Loss 1.8615
Epoch 1 Batch 1000 Loss 1.8886
Epoch 1 Batch 1100 Loss 1.8962
Epoch 1 Batch 1200 Loss 1.7830
Epoch 1 Loss 2.0239
Time taken for 1 epoch 186.8747889995575 sec

Epoch 2 Batch 0 Loss 1.8031
Epoch 2 Batch 100 Loss 1.7642
Epoch 2 Batch 200 Loss 1.8192
Epoch 2 Batch 300 Loss 1.7616
Epoch 2 Batch 400 Loss 1.7505
Epoch 2 Batch 500 Loss 1.7672
Epoch 2 Batch 600 Loss 1.7742
Epoch 2 Batch 700 Loss 1.8806
Epoch 2 Batch 800 Loss 1.7020
Epoch 2 Batch 900 Loss 1.8299
Epoch 2 Batch 1000 Loss 1.7149
Epoch 2 Batch 1100 Loss 1.7823
Epoch 2 Batch 1200 Loss 1.8204
Epoch 2 Loss 1.7761
Time taken for 1 epoch 171.42716097831726 sec

Epoch 3 Batch 0 Loss 1.7419
Epoch 3 Batch 100 Loss 1.7389
Epoch 3 Batch 200 Loss 1.66

In [16]:
ru_max_len = ru_tensor.shape[1]
en_max_len = en_tensor.shape[1]

def translate(sentence):
    sentence = preprocess(sentence)
    inputs = [ru_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = pad_sequences([inputs],
                            maxlen=ru_max_len,
                            padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, UNITS))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']], 0)

    for t in range(en_max_len):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        if en_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence

        result += en_tokenizer.index_word[predicted_id] + ' '
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [17]:
print(translate('Я смогу пойти с тобой.')[0])

i'll be excused with you . 


In [18]:
print(translate('Я не смогу пойти с тобой.')[0])


i couldn't go with you . 


In [19]:
print(translate('Эта домашняя работа очень долго издевалась надо мной!')[0])

this is made me ! 


In [20]:
print(translate('Холодно ли на улице?')[0])

is it cold outside ? 


In [21]:
print(translate('На улице холодно!')[0])
print(translate('На улице холодно?')[0])

it's cold outside . 
is it cold outside ? 


In [22]:
print(translate('На твоём месте я бы не пошел туда')[0])

i wouldn't go there , i pay after all . 
