## Импорты

In [19]:
!pip install pymorphy2



In [20]:
import re
import numpy as np
from tqdm import tqdm
from time import time
from functools import lru_cache
from pymorphy2 import MorphAnalyzer
import multiprocessing as mp

from keras import Model
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, GRU, Dense, Layer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy

import tensorflow as tf
from tensorflow.data import Dataset

from sklearn.model_selection import train_test_split

In [21]:
from google.colab import files

## Настройки

In [22]:
files.upload()

Saving rus.txt to rus (1).txt


In [23]:
!ls


'rus (1).txt'   rus.txt   sample_data


In [24]:
PATH = "./rus.txt"
DATA_LEN = 200000

EPOCHS = 10
BATCH_SIZE = 128
EMBEDDING_DIM = 256
UNITS = 1024

## Данные

In [25]:
morpher = MorphAnalyzer()

@lru_cache(None)
def lemmatize(word):
    return morpher.parse(word)[0].normal_form

In [26]:
def preprocess(text):
    text = text.strip().lower()
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", text)
    # text = re.sub(r'[" "]+', " ",text)
    text = " ".join([lemmatize(word) for word in text.split(" ")])
    text = text.strip()
    return '<start> ' + text + ' <end>'

In [27]:
def tokenize(data):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(data)
    tensor = tokenizer.texts_to_sequences(data)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [28]:
def preprocess_line(line):
    return [preprocess(text) for text in line.split('\t')[:2]]

def load_data(path, num_examples):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        res = []

        with mp.Pool(mp.cpu_count()) as pool:
            res = pool.map(preprocess_line, tqdm(lines[:num_examples]))

        en, ru = zip(*res)
        ru_tensor, ru_tokenizer = tokenize(ru)
        en_tensor, en_tokenizer = tokenize(en)
        return ru_tensor, en_tensor, ru_tokenizer, en_tokenizer

In [29]:
ru_tensor, en_tensor, ru_tokenizer, en_tokenizer = load_data(PATH, DATA_LEN)

ru_len, en_len = ru_tensor.shape[1], en_tensor.shape[1]
ru_tensor_train, ru_tensor_val, en_tensor_train, en_tensor_val = train_test_split(ru_tensor, en_tensor, test_size=0.2)

BUFFER_SIZE = len(ru_tensor_train)
dataset = Dataset.from_tensor_slices((ru_tensor_train, en_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

100%|██████████| 200000/200000 [00:14<00:00, 14240.18it/s]


## Модель

In [31]:
class Encoder(Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(encoder_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoder_units))

In [32]:
class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [33]:
class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.decoder_units = decoder_units
        self.dense = Dense(vocab_size)
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.attention = BahdanauAttention(decoder_units)
        self.gru = GRU(decoder_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden, encoder_output):
        context_vector, attention_weights = self.attention(hidden, encoder_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.dense(output)
        return x, state, attention_weights

In [34]:
ru_vocab_size = len(ru_tokenizer.word_index)+1
en_vocab_size = len(en_tokenizer.word_index)+1

encoder = Encoder(ru_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
decoder = Decoder(en_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

optimizer = Adam()
loss_obj = SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [35]:
def loss_func(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_obj(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

## Обучение

In [36]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_func(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [37]:
steps_per_epoch = len(ru_tensor_train)//BATCH_SIZE

for epoch in range(EPOCHS):
    start_time = time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time() - start_time))

Epoch 1 Batch 0 Loss 4.8011
Epoch 1 Batch 100 Loss 2.2326
Epoch 1 Batch 200 Loss 2.0248
Epoch 1 Batch 300 Loss 1.7915
Epoch 1 Batch 400 Loss 1.6024
Epoch 1 Batch 500 Loss 1.4643
Epoch 1 Batch 600 Loss 1.4605
Epoch 1 Batch 700 Loss 1.2659
Epoch 1 Batch 800 Loss 1.1239
Epoch 1 Batch 900 Loss 1.0564
Epoch 1 Batch 1000 Loss 0.9452
Epoch 1 Batch 1100 Loss 0.8922
Epoch 1 Batch 1200 Loss 0.6844
Epoch 1 Loss 1.4541
Time taken for 1 epoch 201.7161033153534 sec

Epoch 2 Batch 0 Loss 0.7139
Epoch 2 Batch 100 Loss 0.6289
Epoch 2 Batch 200 Loss 0.6877
Epoch 2 Batch 300 Loss 0.5997
Epoch 2 Batch 400 Loss 0.5445
Epoch 2 Batch 500 Loss 0.5613
Epoch 2 Batch 600 Loss 0.5070
Epoch 2 Batch 700 Loss 0.4436
Epoch 2 Batch 800 Loss 0.4691
Epoch 2 Batch 900 Loss 0.5114
Epoch 2 Batch 1000 Loss 0.4821
Epoch 2 Batch 1100 Loss 0.4300
Epoch 2 Batch 1200 Loss 0.4522
Epoch 2 Loss 0.5414
Time taken for 1 epoch 180.532696723938 sec

Epoch 3 Batch 0 Loss 0.3273
Epoch 3 Batch 100 Loss 0.3212
Epoch 3 Batch 200 Loss 0.4097

In [38]:
ru_max_len = ru_tensor.shape[1]
en_max_len = en_tensor.shape[1]

def translate(sentence):
    sentence = preprocess(sentence)
    inputs = [ru_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = pad_sequences([inputs],
                            maxlen=ru_max_len,
                            padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, UNITS))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']], 0)

    for t in range(en_max_len):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        if en_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence

        result += en_tokenizer.index_word[predicted_id] + ' '
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [39]:
print(translate('Я буду делать домашнее задание.')[0])

i'll do my homework . 


In [40]:
print(translate('Я не буду делать домашнее задание.')[0])


i won't do my homework . 


In [41]:
print(translate('Эта домашняя работа очень долго издевалась надо мной!')[0])

my homework is very clear . 


In [42]:
print(translate('Тепло ли на улице?')[0])

is it warm outside ? 


In [43]:
print(translate('На улице жарко!')[0])
print(translate('На улице жарко?')[0])

it's hot outside ! 
is it hot outside ? 


In [44]:
print(translate('На твоём месте я бы не пошел туда')[0])

i'd go if i go there if i go there if i 
