# Домашняя работа

1. Разобраться, как устроена модель перевода.
2. Запустить перевод с русского на английский (при желании можно взять другую пару языков) в двух вариантах: с механизмом внимания и без него.
3. Оценить, насколько корректно модель переводит. Для теста отобрать примеры с возрастающей длиной текста; так как оценка визуальная, достаточно 20 примеров в тестовой выборке.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import os

## Загрузка данных и препроцессинг

In [3]:
!wget http://www.manythings.org/anki/rus-eng.zip

--2022-09-06 15:40:08--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15011848 (14M) [application/zip]
Saving to: ‘rus-eng.zip.1’


2022-09-06 15:40:55 (318 KB/s) - ‘rus-eng.zip.1’ saved [15011848/15011848]



In [4]:
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/

Archive:  rus-eng.zip
  inflating: rus-eng/rus.txt         
  inflating: rus-eng/_about.txt      


In [2]:
# Path to the sentence-pair file extracted from rus-eng.zip (tab-separated fields per line).
TEXT_PATH = 'rus-eng/rus.txt'
# Marker tokens wrapped around every sentence by make_dataset().
START_TOK = '<start>'
END_TOK   = '<end>'

In [3]:
def make_dataset():
    """Load the parallel corpus into a two-column DataFrame.

    Reads the file at ``TEXT_PATH``, splits every line on tabs, keeps the first
    two fields and wraps each of them as ``START_TOK + ' ' + sentence + ' ' + END_TOK``.

    Returns:
        pandas.DataFrame with the columns ``'en'`` and ``'ru'`` (one row per line).
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(TEXT_PATH, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    data = [
        [f'{START_TOK} {sent} {END_TOK}' for sent in line.split('\t')[:2]]
        for line in lines
    ]
    return pd.DataFrame(data=data, columns=['en', 'ru'])

In [4]:
# Build the sentence-pair table and preview its first rows.
df = make_dataset()
df.head()

Unnamed: 0,en,ru
0,<start> Go. <end>,<start> Марш! <end>
1,<start> Go. <end>,<start> Иди. <end>
2,<start> Go. <end>,<start> Идите. <end>
3,<start> Hi. <end>,<start> Здравствуйте. <end>
4,<start> Hi. <end>,<start> Привет! <end>


In [5]:
def tokenize(lang):
    """Fit a Keras tokenizer on `lang` and convert it to padded id sequences.

    Args:
        lang: iterable of sentences (strings) in one language.

    Returns:
        (tensor, tokenizer): the id matrix produced by `pad_sequences`
        (padding added after each sequence) and the fitted tokenizer.
    """
    # filters='' turns off the tokenizer's character filtering.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    padded_ids = tf.keras.preprocessing.sequence.pad_sequences(
        lang_tokenizer.texts_to_sequences(lang),
        padding='post',
    )
    return padded_ids, lang_tokenizer

In [6]:
# Tokenise each language column separately: padded id matrix + fitted tokenizer per language.
en_data, en_tokenizer = tokenize(df['en'])
ru_data, ru_tokenizer = tokenize(df['ru'])

In [7]:
# Display the shapes of the two padded id matrices.
en_data.shape, ru_data.shape, 

((451436, 103), (451436, 82))

In [8]:
# Shuffle buffer covers the whole dataset.
BUFFER_SIZE = len(en_data)
BATCH_SIZE = 64
# Each element is a (Russian ids, English ids) pair: Russian is the input, English the target.
dataset = tf.data.Dataset.from_tensor_slices((ru_data, en_data)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards the last incomplete batch so every batch has BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2022-09-07 10:18:57.339702: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-09-07 10:18:57.340232: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Обучение

In [9]:
# Vocabulary sizes; the +1 presumably reserves id 0 (used for padding) — confirm against
# the Keras Tokenizer documentation.
VOCAB_EN_SIZE = len(en_tokenizer.word_index)+1
VOCAB_RU_SIZE = len(ru_tokenizer.word_index)+1
# Size of the token embedding vectors.
EMBEDDING_DIM = 300
# Number of GRU units in the encoder and the decoder.
UNITS = 1024

In [10]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder that returns per-step outputs and the final state."""

  def __init__(self, vocab_size, embedding_dim, units, batch_size):
    """Create the embedding and GRU layers.

    Args:
      vocab_size: number of rows in the embedding table.
      embedding_dim: size of each embedding vector.
      units: number of GRU units.
      batch_size: batch size used when building the zero initial state.
    """
    super().__init__()
    self.batch_size = batch_size
    self.units = units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Embed the token ids `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state) as produced by the GRU with return_sequences=True
      and return_state=True.
    """
    embedded = self.embedding(x)
    sequence_out, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_out, final_state

  def initialize_hidden_state(self):
    """Return an all-zero tensor of shape (batch_size, units)."""
    state_shape = (self.batch_size, self.units)
    return tf.zeros(state_shape)

In [11]:
class Attention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        """Create the two projection layers (`units` wide) and the 1-unit scoring layer."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: hidden state, shape (batch_size, hidden size).
            values: sequence to attend over, shape (batch_size, max_len, hidden size).

        Returns:
            (context_vector, weights); weights has shape (batch_size, max_length, 1).
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_with_time_axis = tf.expand_dims(query, 1)
        projected = self.W1(query_with_time_axis) + self.W2(values)
        # score shape (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(projected))
        # Normalise across axis 1.
        weights = tf.nn.softmax(score, axis=1)
        weighted_values = weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)
        return context_vector, weights

In [12]:
class Decoder(tf.keras.Model):
  """Single-step GRU decoder that attends over the encoder output."""

  def __init__(self, vocab_size, embedding_dim, units, batch_size):
    """Create the embedding, GRU, output projection and attention layers.

    Args:
      vocab_size: number of embedding rows and width of the output logits.
      embedding_dim: size of each embedding vector.
      units: number of GRU units (also passed to the attention layer).
      batch_size: stored on the instance; not read by `call`.
    """
    super().__init__()
    self.batch_size = batch_size
    self.units = units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = Attention(self.units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: token ids for the current step.
      hidden: state used as the attention query.
      enc_output: encoder outputs, shape (batch_size, max_length, hidden_size).

    Returns:
      (logits, state, attention_weights); logits has shape (batch_size, vocab).
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    embedded = self.embedding(x)

    # Prepend the context vector to the embedding along the last axis:
    # (batch_size, 1, embedding_dim + hidden_size)
    context_step = tf.expand_dims(context_vector, 1)
    gru_input = tf.concat([context_step, embedded], axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden` only
    # feeds the attention query — confirm this is intended.
    output, state = self.gru(gru_input)

    # Flatten to (batch_size, hidden_size)
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # Project to vocabulary logits: (batch_size, vocab)
    logits = self.fc(flat_output)

    return logits, state, attention_weights

In [13]:
# The encoder uses the Russian vocabulary size, the decoder the English one.
encoder = Encoder(VOCAB_RU_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)
decoder = Decoder(VOCAB_EN_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)

In [14]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no activation, so it emits raw logits.
# reduction='none': the loss returns one value per example instead of a reduced scalar.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [15]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inp: source token ids for the batch (fed to the encoder).
        targ: target token ids for the batch; column 0 is expected to hold the
            start token and id 0 is treated as padding.
        enc_hidden: initial encoder state.

    Returns:
        Scalar tensor: the summed per-step mean loss divided by the target length.
    """
    start_id = en_tokenizer.word_index[START_TOK]
    loss = 0.0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Size the start-token column from the batch itself rather than a global constant.
        dec_input = tf.fill([tf.shape(targ)[0], 1], start_id)
        for i in range(1, targ.shape[1]):
            preds, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            step_target = targ[:, i]
            # loss_func uses reduction='none', so it yields one value per example.
            step_loss = loss_func(step_target, preds)
            # Zero out padded positions (id 0) so they contribute nothing to the
            # loss or the gradients, then reduce to a scalar.
            pad_mask = tf.cast(tf.not_equal(step_target, 0), step_loss.dtype)
            loss += tf.reduce_mean(step_loss * pad_mask)
            # Teacher forcing: the ground-truth token becomes the next decoder input.
            dec_input = tf.expand_dims(step_target, 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [16]:
# Number of passes over the training data.
EPOCHS = 50

In [17]:
import time


# Number of full batches per epoch.
steps_per_epoch = len(ru_data) // BATCH_SIZE
for epoch in range(EPOCHS):
    start_time = time.time()
    # Every batch in the epoch starts from the same zero encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        # reduce_mean collapses a per-example loss vector to a scalar and leaves an
        # already-scalar loss unchanged, so the running total is always one number.
        batch_loss = tf.reduce_mean(train_step(inp, targ, enc_hidden))
        total_loss += batch_loss

        # Must sit inside the batch loop so progress is reported every 100 batches,
        # not once per epoch.
        if batch % 100 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy()}')

    elapsed_time = time.time() - start_time
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss {total_loss / steps_per_epoch}, {elapsed_time} sec\n')

2022-09-07 10:19:48.741916: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-09-07 10:19:48.929691: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 10:19:57.676045: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 10:20:00.545439: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 10:20:01.282925: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 10:20:01.487801: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 10:20:01.566477: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113