In [1]:
# !wget http://www.manythings.org/anki/rus-eng.zip
# !mkdir data
# !unzip rus-eng.zip -d data/
# !ls /content/data/ -lah

## Импорты

In [2]:
import re
from time import time

from keras import Model
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, GRU, Dense
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy

import tensorflow as tf
from tensorflow.data import Dataset

from sklearn.model_selection import train_test_split

## Настройки

In [3]:
# Corpus file passed to load_data, which reads the first two tab-separated
# columns of every line.
PATH = "./data/rus.txt"
# Number of leading lines of the corpus file to load.
DATA_LEN = 200000

# Training hyperparameters.
EPOCHS = 10
BATCH_SIZE = 64
# Embedding vector size, shared by the encoder and the decoder.
EMBEDDING_DIM = 300
# Number of GRU units, shared by the encoder and the decoder.
UNITS = 1024

## Данные

In [4]:
def preprocess(text):
    """Normalize a raw sentence and wrap it in ``<start>``/``<end>`` markers.

    The text is lower-cased, the punctuation marks ``? . ! ,`` are separated
    from the neighbouring words by spaces, and every character that is not a
    Latin letter, a Cyrillic letter, an apostrophe or one of those punctuation
    marks is replaced by a single space.

    Args:
        text: Raw sentence (English or Russian).

    Returns:
        The cleaned sentence as ``'<start> ... <end>'`` with tokens separated
        by single spaces.
    """
    text = text.strip().lower()
    # Surround punctuation with spaces so each mark becomes its own token.
    text = re.sub(r"([?.!,])", r" \1 ", text)
    # Collapse runs of spaces and double quotes into a single space.
    text = re.sub(r'[" "]+', " ", text)
    # 'ё'/'Ё' sit outside the а-я / А-Я code-point ranges, so they must be
    # listed explicitly; otherwise words such as "твоём" are split in two.
    text = re.sub(r"[^a-zA-Zа-яА-ЯёЁ?.!,']+", " ", text)
    text = text.strip()
    return '<start> ' + text + ' <end>'

In [5]:
def tokenize(data):
    """Fit a word-level tokenizer on ``data`` and encode it as a padded id matrix.

    The tokenizer is created with ``filters=''`` so that no characters are
    stripped: punctuation tokens and the ``<start>``/``<end>`` markers stay in
    the vocabulary.

    Args:
        data: Iterable of preprocessed sentences.

    Returns:
        A ``(tensor, tokenizer)`` pair: the id sequences padded at the end
        (``padding='post'``) to a common length, and the fitted tokenizer.
    """
    vocab = Tokenizer(filters='')
    vocab.fit_on_texts(data)
    padded_ids = pad_sequences(vocab.texts_to_sequences(data), padding='post')
    return padded_ids, vocab

In [6]:
def load_data(path, num_examples):
    """Read a tab-separated parallel corpus and tokenize both languages.

    Only the first ``num_examples`` lines of the file are used. On every line
    the first column is taken as the English sentence and the second as the
    Russian one; any further columns are ignored. Lines with fewer than two
    columns (blank or malformed lines) are skipped.

    Args:
        path: Path to the UTF-8 encoded corpus file.
        num_examples: Number of leading lines to read.

    Returns:
        ``(ru_tensor, en_tensor, ru_tokenizer, en_tokenizer)``: the padded id
        matrices and the fitted tokenizers for each language.

    Raises:
        ValueError: If no usable sentence pair is found.
    """
    # Keep the file open only for reading; the processing below does not need it.
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()[:num_examples]

    pairs = []
    for line in lines:
        fields = line.split('\t')
        # A one-column row would make zip(*pairs) collapse to a single tuple
        # and break the two-way unpacking below.
        if len(fields) < 2:
            continue
        pairs.append([preprocess(text) for text in fields[:2]])

    if not pairs:
        raise ValueError(f'no sentence pairs found in {path!r}')

    en, ru = zip(*pairs)
    ru_tensor, ru_tokenizer = tokenize(ru)
    en_tensor, en_tokenizer = tokenize(en)
    return ru_tensor, en_tensor, ru_tokenizer, en_tokenizer

In [7]:
# Load and tokenize the corpus: padded id matrices plus the fitted tokenizers.
ru_tensor, en_tensor, ru_tokenizer, en_tokenizer = load_data(PATH, DATA_LEN)

# Padded sequence lengths (second axis of each id matrix).
ru_len, en_len = ru_tensor.shape[1], en_tensor.shape[1]
# Hold out 20% of the pairs.
# NOTE(review): no random_state is passed, so the split differs between runs.
ru_tensor_train, ru_tensor_val, en_tensor_train, en_tensor_val = train_test_split(ru_tensor, en_tensor, test_size=0.2)

# Shuffle buffer covering the whole training set.
BUFFER_SIZE = len(ru_tensor_train)
dataset = Dataset.from_tensor_slices((ru_tensor_train, en_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows; train_step
# and the encoder's initial hidden state are built for that fixed size.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## Модель

In [8]:
class Encoder(Model):
    """GRU encoder that condenses a source sentence into one state vector.

    Args:
        vocab_size: Size of the source vocabulary (embedding input dimension).
        embedding_dim: Size of the embedding vectors.
        encoder_units: Number of GRU units.
        batch_size: Batch size used to build the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        # Only the final state is consumed by call(), so per-step outputs
        # are not requested.
        self.gru = GRU(
            encoder_units,
            return_sequences=False,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed the token ids ``x`` and return the GRU's final state.

        ``hidden`` is used as the GRU's initial state.
        """
        embedded = self.embedding(x)
        _, final_state = self.gru(embedded, initial_state=hidden)
        return final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, encoder_units)``."""
        state_shape = (self.batch_size, self.encoder_units)
        return tf.zeros(state_shape)

In [9]:
class Decoder(Model):
    """GRU decoder that maps target token ids to vocabulary logits.

    Args:
        vocab_size: Size of the target vocabulary (embedding input dimension
            and width of the output layer).
        embedding_dim: Size of the embedding vectors.
        decoder_units: Number of GRU units.
        batch_size: Batch size; stored on the instance.
    """

    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.decoder_units = decoder_units
        # No activation: the layer emits raw logits.
        self.dense = Dense(vocab_size)
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(
            decoder_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Run the decoder on token ids ``x`` starting from state ``hidden``.

        Returns:
            ``(logits, state)``: logits with the batch and time axes of the
            GRU output merged into one, and the GRU's final state.
        """
        embedded = self.embedding(x)
        sequence_out, state = self.gru(embedded, initial_state=hidden)
        # Merge the leading axes so the dense layer sees one row per
        # (example, time step) pair.
        flat = tf.reshape(sequence_out, (-1, sequence_out.shape[2]))
        logits = self.dense(flat)
        return logits, state

In [10]:
# +1 because the Keras Tokenizer numbers words from 1; index 0 is the padding id.
ru_vocab_size = len(ru_tokenizer.word_index)+1
en_vocab_size = len(en_tokenizer.word_index)+1

encoder = Encoder(ru_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
decoder = Decoder(en_vocab_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

optimizer = Adam()
# reduction='none' keeps one loss value per example so that loss_func can
# zero out padded positions before averaging.
loss_obj = SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [11]:
def loss_func(real, pred):
    """Cross-entropy for one decoding step with padded targets zeroed out.

    Args:
        real: Target token ids; id 0 marks padding.
        pred: Logits predicted for those targets.

    Returns:
        The mean of the per-example losses, where entries whose target is
        padding contribute 0 (they are still counted in the mean).
    """
    per_example = loss_obj(real, pred)
    is_token = tf.math.not_equal(real, 0)
    weights = tf.cast(is_token, dtype=per_example.dtype)
    return tf.reduce_mean(per_example * weights)

## Обучение

In [12]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimization step on a single batch.

    Args:
        inp: Batch of source (Russian) token ids.
        targ: Batch of target (English) token ids.
        enc_hidden: Initial hidden state for the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    # Only the forward pass is recorded; gradients are computed after the
    # context exits so the tape does not stay active during the backward
    # pass and the optimizer update.
    with tf.GradientTape() as tape:
        enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing: the ground-truth token at step t is the decoder
        # input for step t + 1.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden)
            loss += loss_func(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported value only; the gradients below use the un-normalized sum.
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [13]:
# Number of full batches per epoch (the dataset drops the incomplete last batch).
steps_per_epoch = len(ru_tensor_train)//BATCH_SIZE

for epoch in range(EPOCHS):
    start_time = time()

    # The same all-zero initial encoder state is passed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        # Report progress every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
        
    # Mean batch loss over the epoch and wall-clock time spent on it.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time() - start_time))

Epoch 1 Batch 0 Loss 4.6802
Epoch 1 Batch 100 Loss 2.2353
Epoch 1 Batch 200 Loss 1.9267
Epoch 1 Batch 300 Loss 1.8847
Epoch 1 Batch 400 Loss 1.7956
Epoch 1 Batch 500 Loss 1.6288
Epoch 1 Batch 600 Loss 1.6078
Epoch 1 Batch 700 Loss 1.5801
Epoch 1 Batch 800 Loss 1.5211
Epoch 1 Batch 900 Loss 1.3857
Epoch 1 Batch 1000 Loss 1.4045
Epoch 1 Batch 1100 Loss 1.4904
Epoch 1 Batch 1200 Loss 1.2195
Epoch 1 Batch 1300 Loss 1.2420
Epoch 1 Batch 1400 Loss 1.0829
Epoch 1 Batch 1500 Loss 1.0045
Epoch 1 Batch 1600 Loss 1.0912
Epoch 1 Batch 1700 Loss 1.1163
Epoch 1 Batch 1800 Loss 1.0809
Epoch 1 Batch 1900 Loss 0.8187
Epoch 1 Batch 2000 Loss 0.9313
Epoch 1 Batch 2100 Loss 0.8531
Epoch 1 Batch 2200 Loss 0.8059
Epoch 1 Batch 2300 Loss 0.7537
Epoch 1 Batch 2400 Loss 0.8523
Epoch 1 Loss 1.3052
Time taken for 1 epoch 153.2351679801941 sec

Epoch 2 Batch 0 Loss 0.6106
Epoch 2 Batch 100 Loss 0.6669
Epoch 2 Batch 200 Loss 0.7038
Epoch 2 Batch 300 Loss 0.6571
Epoch 2 Batch 400 Loss 0.6142
Epoch 2 Batch 500 Loss 

In [33]:
# Padded sequence lengths of the corpus: translate() pads its input to
# ru_max_len and decodes at most en_max_len tokens.
ru_max_len = ru_tensor.shape[1]
en_max_len = en_tensor.shape[1]

def translate(sentence):
    """Greedily translate a Russian sentence into English with the trained model.

    The sentence is preprocessed like the training data, encoded, and decoded
    one token at a time by always taking the highest-scoring token, until
    ``<end>`` is produced or ``en_max_len`` tokens have been generated.

    Words that are missing from the Russian vocabulary are skipped rather
    than raising ``KeyError``.

    Args:
        sentence: Raw Russian sentence.

    Returns:
        ``(result, sentence)``: the translation, with a space after every
        token, and the preprocessed input sentence.
    """
    sentence = preprocess(sentence)
    # Drop out-of-vocabulary words; the <start>/<end> markers are always known.
    known_ids = ru_tokenizer.word_index
    inputs = [known_ids[word] for word in sentence.split(' ') if word in known_ids]
    inputs = pad_sequences([inputs],
                            maxlen=ru_max_len,
                            padding='post')

    inputs = tf.convert_to_tensor(inputs)

    words = []
    hidden = [tf.zeros((1, UNITS))]
    enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']], 0)

    for _ in range(en_max_len):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()
        # Id 0 is padding and has no entry in index_word; treat it like <end>
        # instead of failing with KeyError.
        word = en_tokenizer.index_word.get(predicted_id)
        if word is None or word == '<end>':
            break

        words.append(word)
        # Feed the predicted token back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    # Same format as before: every token is followed by a single space.
    result = ''.join(word + ' ' for word in words)
    return result, sentence

In [34]:
# Affirmative statement.
print(translate('Я смогу пойти с тобой.')[0])

i'll go with you . 


In [35]:
# Same sentence with a negation added.
print(translate('Я не смогу пойти с тобой.')[0])


i can't go with you . 


In [36]:
print(translate('Эта домашняя работа очень долго издеваться надо мной!')[0]) # 'издеваться' is used because 'издевалась' was not in the vocabulary

this day flows to do . 


Звучит как троллинг

In [37]:
# Question formed with the particle 'ли'.
print(translate('Холодно ли на улице?')[0])

is it cold outside ? 


In [38]:
# Identical words; only the final punctuation mark differs.
print(translate('На улице холодно!')[0])
print(translate('На улице холодно?')[0])

it's cold outside . 
is it cold outside ? 


In [39]:
# Conditional sentence without final punctuation.
print(translate('На твоём месте я бы не пошел туда')[0])

i'd go if i were up there . 
