In [119]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import codecs

from torch.utils.data import TensorDataset, DataLoader

%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [120]:
df = pd.read_csv(r"C:\Users\RK\rus-eng\rus.txt",sep='\t',names=['eng','rus','c'],encoding='utf-8')[['eng','rus']]

In [121]:
df

Unnamed: 0,eng,rus
0,Go.,Марш!
1,Go.,Иди.
2,Go.,Идите.
3,Hi.,Здравствуйте.
4,Hi.,Привет!
...,...,...
496054,"At a moment when our economy is growing, our b...","В тот момент, когда наша экономика растёт, наш..."
496055,"When I was younger, I hated going to weddings....","Когда я была помоложе, я ненавидела ходить на ..."
496056,Since there are usually multiple websites on a...,"Поскольку сайтов, посвящённых какой-либо теме,..."
496057,If someone who doesn't know your background sa...,"Если кто-то незнакомый говорит, что вы говорит..."


In [122]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [123]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s,lang = 'eng'):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    if lang=='eng':
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    elif lang=='rus':
        r = re.compile(r"[^ЁёА-я.!?]+")
        r.sub(" ", s)
    return s

In [124]:
def readLangs(df, reverse=False):
  
    if reverse:
        lang1 = df.columns[1]
        lang2 = df.columns[0]
    else:
        lang1 = df.columns[0]
        lang2 = df.columns[1]
    
    print(f"Reading lines {lang1} to {lang2}...")
    
    pairs = df.copy()
    
    pairs[lang1] = pairs[lang1].apply(lambda x: normalizeString(x,lang=lang1))
    pairs[lang2] = pairs[lang2].apply(lambda x: normalizeString(x,lang=lang2))
    
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    
    return input_lang, output_lang, pairs[[lang1,lang2]]

In [125]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPairs(df):
    lang1 = df.columns[0]
    lang2 = df.columns[1]
    return df[
                     (df[lang1].str.split(" ").map(len)<MAX_LENGTH) 
                   & (df[lang2].str.split(" ").map(len)<MAX_LENGTH)
                   & (df[lang2].str.startswith(eng_prefixes))
                ].reset_index(drop=True)

In [126]:
def prepareData(df, reverse=False):
    input_lang, output_lang, pairs = readLangs(df, reverse=True)
    print("Read %s sentence pairs" % pairs.shape[0])
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % pairs.shape[0])
    print("Counting words...")
    _ = pairs[pairs.columns[0]].apply(input_lang.addSentence)
    _ = pairs[pairs.columns[1]].apply(output_lang.addSentence)
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData(df,reverse=True)
print(pairs.sample(1))

Reading lines rus to eng...
Read 496059 sentence pairs
Trimmed to 28727 sentence pairs
Counting words...
Counted words:
rus 10804
eng 4306
                       rus                   eng
7368  мне как-то страшно .  i m kind of scared .


### Тренировочный и проверочные датасеты

In [477]:
#get train and test datasets
testcoeff = 0.1
test_pairs  = pairs.sample(int(pairs.shape[0]*testcoeff))
train_pairs = pairs[~pairs.index.isin(test_pairs.index)].to_numpy()
test_pairs = test_pairs.to_numpy()
print(f'train shape: {train_pairs.shape}, test_shape: {test_pairs.shape}')

train shape: (25855, 2), test_shape: (2872, 2)


In [478]:
train_pairs

array([['мне девятнадцать лет .', 'i m .'],
       ['со мнои все в порядке .', 'i m ok .'],
       ['у меня все хорошо .', 'i m ok .'],
       ...,
       ['она проводит каждое воскресенье со своеи бабушкои .',
        'she spends time with her grandmother every sunday .'],
       ['после аварии она перестала бывать на людях .',
        'she stopped appearing in public after her accident .'],
       ['они ведут переговоры, чтобы приити к приемлемому компромиссу .',
        'they are negotiating to reach a satisfactory compromise .']],
      dtype=object)

In [479]:
BATCH_SIZE = 1

The Encoder
-----------


In [140]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,layer_num=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,num_layers=layer_num)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, BATCH_SIZE, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device)

The Decoder
-----------

In [141]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,layer_num=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,num_layers=layer_num)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, BATCH_SIZE, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device)

In [104]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [77]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [78]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [79]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for i in range(n_iters)]
    
    #training_pairs = [tensorsFromPair(x) for x in train_pairs]
    
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [80]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [81]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [102]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

### Тренируем без изменений моделей

In [24]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1, 75000, print_every=5000)

0m 59s (- 13m 46s) (5000 6%) 3.0999
1m 49s (- 11m 53s) (10000 13%) 2.6345
2m 40s (- 10m 43s) (15000 20%) 2.3529
3m 32s (- 9m 43s) (20000 26%) 2.1575
4m 23s (- 8m 47s) (25000 33%) 2.0192
5m 14s (- 7m 52s) (30000 40%) 1.8629
6m 6s (- 6m 58s) (35000 46%) 1.7465
6m 57s (- 6m 5s) (40000 53%) 1.6711
7m 49s (- 5m 12s) (45000 60%) 1.5388
8m 41s (- 4m 20s) (50000 66%) 1.4955
9m 33s (- 3m 28s) (55000 73%) 1.4233
10m 24s (- 2m 36s) (60000 80%) 1.3349
11m 16s (- 1m 44s) (65000 86%) 1.3045
12m 8s (- 0m 52s) (70000 93%) 1.2223
13m 0s (- 0m 0s) (75000 100%) 1.1790


In [25]:
evaluateRandomly(encoder1, decoder1)

> вы немного странныи .
= you re a little weird .
< you re a bit strange . <EOS>

> у меня серьезные проблемы .
= i am in deep water .
< i m in trouble . . <EOS>

> мы стоим перед трудным выбором .
= we are faced with a difficult choice .
< we are faced with our uncle . <EOS>

> я до смерти боюсь пресмыкающихся .
= i m deathly afraid of reptiles .
< i m afraid of death . <EOS>

> я рисковая .
= i m adventurous .
< i m an . <EOS>

> вам сюда нельзя .
= you re not allowed in here .
< you re not allowed in . . <EOS>

> ты замечательныи друг .
= you re a wonderful friend .
< you re an friend . <EOS>

> вы все довольны .
= you re all happy .
< you re all right . <EOS>

> я не вооружена .
= i m unarmed .
< i m not an . . <EOS>

> я еще слишком молод для этого .
= i m too young to do that yet .
< i m too young to do this . <EOS>



### Добавим рекурентный слой

In [47]:
layer_num = 2
hidden_size = 256
encoder2 = EncoderRNN(input_lang.n_words, hidden_size,layer_num).to(device)
decoder2 = DecoderRNN(hidden_size, output_lang.n_words,layer_num).to(device)

trainIters(encoder2, decoder2, 75000, print_every=5000)

1m 5s (- 15m 11s) (5000 6%) 3.1344
2m 2s (- 13m 19s) (10000 13%) 2.7124
3m 1s (- 12m 6s) (15000 20%) 2.4445
4m 0s (- 11m 1s) (20000 26%) 2.2477
4m 59s (- 9m 59s) (25000 33%) 2.0614
5m 59s (- 8m 58s) (30000 40%) 1.9540
6m 58s (- 7m 58s) (35000 46%) 1.8326
7m 57s (- 6m 57s) (40000 53%) 1.7264
8m 56s (- 5m 57s) (45000 60%) 1.6329
9m 55s (- 4m 57s) (50000 66%) 1.5137
10m 53s (- 3m 57s) (55000 73%) 1.4571
11m 52s (- 2m 58s) (60000 80%) 1.3684
12m 51s (- 1m 58s) (65000 86%) 1.2925
13m 50s (- 0m 59s) (70000 93%) 1.2558
14m 50s (- 0m 0s) (75000 100%) 1.1885


In [48]:
evaluateRandomly(encoder2, decoder2)

> я читаю книгу об американскои истории .
= i m reading a book on american history .
< i m reading a book of a . . <EOS>

> я немного смущен .
= i m a bit confused .
< i m a bit shy . <EOS>

> мы ваша последняя надежда .
= we re your last hope .
< we re going to be your plan . <EOS>

> рада тебя видеть, том .
= i m glad to see you tom .
< i m glad to you tom . <EOS>

> вы постоянно ко мне придираетесь .
= you re always finding fault with me .
< you re always finding with me . <EOS>

> я чувствую легкии голод .
= i m getting a little hungry .
< i m feeling feeling . <EOS>

> мы сожалеем, что это произошло .
= we re sorry that it happened .
< we re sure that s happened . <EOS>

> ты же не женат ?
= you re single aren t you ?
< you re not married are you ? ? ? <EOS>

> красивая, правда ?
= she s beautiful isn t she ?
< i m dying about everything aren t ? ? <EOS>

> я за ним .
= i m behind him .
< i m ready . <EOS>



### Заменим ячейку GRU на LSTM

In [94]:
class EncoderRNN2(nn.Module):
    def __init__(self, input_size, hidden_size,layer_num=1):
        super(EncoderRNN2, self).__init__()
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size,num_layers=layer_num)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, BATCH_SIZE, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device),
                torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device))

In [95]:
class DecoderRNN2(nn.Module):
    def __init__(self, hidden_size, output_size,layer_num=1):
        super(DecoderRNN2, self).__init__()
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size,num_layers=layer_num)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, BATCH_SIZE, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return (torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device),
                torch.zeros(self.layer_num, BATCH_SIZE, self.hidden_size, device=device))

In [89]:
layer_num = 1
hidden_size = 256
encoder3 = EncoderRNN2(input_lang.n_words, hidden_size, layer_num).to(device)
decoder3 = DecoderRNN2(hidden_size, output_lang.n_words, layer_num).to(device)

trainIters(encoder3, decoder3, 75000, print_every=5000)

0m 58s (- 13m 35s) (5000 6%) 3.2439
1m 49s (- 11m 54s) (10000 13%) 2.7884
2m 42s (- 10m 49s) (15000 20%) 2.5888
3m 34s (- 9m 50s) (20000 26%) 2.3689
4m 27s (- 8m 54s) (25000 33%) 2.2432
5m 20s (- 8m 0s) (30000 40%) 2.0884
6m 13s (- 7m 7s) (35000 46%) 1.9598
7m 7s (- 6m 13s) (40000 53%) 1.8978
8m 0s (- 5m 20s) (45000 60%) 1.7956
8m 53s (- 4m 26s) (50000 66%) 1.7104
9m 46s (- 3m 33s) (55000 73%) 1.6203
10m 39s (- 2m 39s) (60000 80%) 1.5582
11m 32s (- 1m 46s) (65000 86%) 1.4897
12m 25s (- 0m 53s) (70000 93%) 1.4260
13m 18s (- 0m 0s) (75000 100%) 1.3684


In [105]:
evaluateRandomly(encoder3, decoder3)

> успех ему обеспечен .
= he is bound to succeed .
< he is the . . . <EOS>

> я счастливее вас .
= i m happier than you .
< i m older than you . <EOS>

> я не говорю, что его музыка плохая .
= i m not saying his music is bad .
< i m not ashamed of what i . <EOS>

> мне надоело слушать тома .
= i m sick of listening to tom .
< i m tired of tom s . . <EOS>

> я пишу эссе .
= i m writing an essay .
< i m writing a letter . <EOS>

> вы обе симпатичные .
= you re both pretty .
< you re both up . <EOS>

> я жду своеи очереди .
= i m waiting for my turn .
< i m looking my my . . <EOS>

> я худая .
= i m thin .
< i m an . <EOS>

> я никчемен .
= i m useless .
< i m a . <EOS>

> ты опять тут .
= you re back again .
< you re back here . <EOS>



### Добавим слой к LSTM

In [96]:
layer_num = 2
hidden_size = 256
encoder4 = EncoderRNN2(input_lang.n_words, hidden_size,layer_num).to(device)
decoder4 = DecoderRNN2(hidden_size, output_lang.n_words,layer_num).to(device)

trainIters(encoder4, decoder4, 75000, print_every=5000)

1m 5s (- 15m 17s) (5000 6%) 3.3529
2m 3s (- 13m 22s) (10000 13%) 2.8448
3m 2s (- 12m 10s) (15000 20%) 2.6480
4m 1s (- 11m 2s) (20000 26%) 2.4950
5m 0s (- 10m 0s) (25000 33%) 2.3934
5m 59s (- 8m 58s) (30000 40%) 2.2527
6m 57s (- 7m 57s) (35000 46%) 2.1321
7m 56s (- 6m 56s) (40000 53%) 2.0132
8m 55s (- 5m 56s) (45000 60%) 1.9300
9m 54s (- 4m 57s) (50000 66%) 1.8367
10m 52s (- 3m 57s) (55000 73%) 1.7785
11m 52s (- 2m 58s) (60000 80%) 1.6856
12m 51s (- 1m 58s) (65000 86%) 1.6026
13m 52s (- 0m 59s) (70000 93%) 1.5375
15m 1s (- 0m 0s) (75000 100%) 1.4619


In [106]:
evaluateRandomly(encoder4, decoder4)

> вы не туда идете .
= you re going the wrong direction .
< you re going to go there . <EOS>

> я доволен тем, что сделал том .
= i m happy with what tom did .
< i m happy to see tom . . <EOS>

> я заслуживаю доверия .
= i m reliable .
< i m the one who understands . <EOS>

> я оставлю книги здесь .
= i am leaving the books here .
< i m going to go here . <EOS>

> ты повыше меня .
= you re a bit taller than i am .
< you re a than me am . <EOS>

> ты странныи парень .
= you re a weird guy .
< you re a strange person . <EOS>

> у тебя носки надеты наизнанку .
= you re wearing your socks inside out .
< you re asking in good way . <EOS>

> я здесь не для того, чтобы кому-то навредить .
= i m not here to hurt anybody .
< i m not the only one here . <EOS>

> я просто шучу .
= i m only joking .
< i m just just . <EOS>

> я привыкла сама себе готовить .
= i m used to cooking for myself .
< i m getting to french french . . <EOS>



ВЫВОДЫ: LSTM ячейки обучаются дольше, чем GRU, как и должно быть. 
Увеличение числа скрытых слоёв даёт худший результат на маленьком количестве итераций. 
Все методы показали достаточно неплохое качество переаода

## *Обучение батчами 

In [466]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

#функция преобразована для обучения на батчах
def tensorFromSentence2(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    #indexes.append(EOS_token)
    indexes = indexes + [EOS_token]*(MAX_LENGTH-len(indexes))
    return np.array(indexes)
#torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair2(pair):
    input_tensor = tensorFromSentence2(input_lang, pair[0])
    target_tensor = tensorFromSentence2(output_lang, pair[1])
    return np.array([input_tensor, target_tensor])

In [480]:
BATCH_SIZE = 32
train_pairs = np.array([tensorsFromPair2(x) for x in train_pairs])
test_pairs = np.array([tensorsFromPair2(x) for x in test_pairs])

#train_pairs = train_pairs[np.argsort((train_pairs[:,1,:] == 1).sum(axis=1))]
#test_pairs = test_pairs[np.argsort((test_pairs[:,1,:] == 1).sum(axis=1))]

train_dataset = TensorDataset(torch.Tensor(train_pairs[:,0,:]).long(),torch.Tensor(train_pairs[:,1,:]).long())
test_dataset = TensorDataset(torch.Tensor(test_pairs[:,0,:]).long(),torch.Tensor(test_pairs[:,1,:]).long())

train_dl = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_dl = DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)

In [481]:
def trainIters(encoder, decoder, epoch=10, print_every=100, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()
    
    for ep in range(epoch):
        train_iters, train_passed  = 0, 0
        train_loss, train_acc = 0., 0.
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        
        for X, y in train_dl:
            if X.shape[0] !=BATCH_SIZE:
                    continue

            input_tensor, target_tensor = X.to(device), y.to(device)
            #input_tensor = training_pair[0]
            #target_tensor = training_pair[1]
            
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            
            print_loss_total += loss
            plot_loss_total += loss
            
            train_iters += 1
            train_passed += len(X)
            

        if ep % 1 == 0:
            print_loss_avg = print_loss_total / train_iters
            print_loss_total = 0 
            print('%s (%d %d%%) %.4f' % (timeSince(start, (ep+1) / epoch),
                                             ep, ep / epoch * 100, print_loss_avg))

    #showPlot(plot_losses)

In [482]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    #ограничиваем длиной минимального предложения в батче, иначе модель, демонстрируя отличный loss на обучении
    #на тестовых данных показывает какую то погоду
    input_length = input_tensor.size(1) #max_length - (input_tensor == 1).sum(axis=1).max().item() + 1 
    target_length = target_tensor.size(1) #max_length - (target_tensor == 1).sum(axis=1).max().item() + 1 
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    #print(input_length)
    loss = 0
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[:,ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]*BATCH_SIZE], device=device).view(-1,1)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output.squeeze(), target_tensor[:,di])
            decoder_input = target_tensor[:,di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output,target_tensor[:,di])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    
    #print(loss)
    return loss.item() / target_length

In [483]:
BATCH_SIZE=32
hidden_size = 256
encoderB = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoderB = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoderB, decoderB, 30, print_every=100)

0m 11s (- 5m 42s) (0 0%) 2.9880
0m 23s (- 5m 28s) (1 3%) 2.5612
0m 35s (- 5m 15s) (2 6%) 2.4301
0m 46s (- 5m 2s) (3 10%) 2.3127
0m 58s (- 4m 50s) (4 13%) 2.2680
1m 9s (- 4m 38s) (5 16%) 2.2074
1m 21s (- 4m 26s) (6 20%) 2.1379
1m 32s (- 4m 15s) (7 23%) 2.0815
1m 44s (- 4m 3s) (8 26%) 2.0319
1m 55s (- 3m 51s) (9 30%) 1.9558
2m 7s (- 3m 40s) (10 33%) 1.9238
2m 19s (- 3m 29s) (11 36%) 1.8754
2m 30s (- 3m 17s) (12 40%) 1.8397
2m 42s (- 3m 5s) (13 43%) 1.8121
2m 54s (- 2m 54s) (14 46%) 1.7873
3m 5s (- 2m 42s) (15 50%) 1.7570
3m 17s (- 2m 30s) (16 53%) 1.7340
3m 28s (- 2m 19s) (17 56%) 1.6999
3m 40s (- 2m 7s) (18 60%) 1.6637
3m 52s (- 1m 56s) (19 63%) 1.6429
4m 3s (- 1m 44s) (20 66%) 1.6116
4m 15s (- 1m 32s) (21 70%) 1.5981
4m 26s (- 1m 21s) (22 73%) 1.5702
4m 38s (- 1m 9s) (23 76%) 1.5347
4m 49s (- 0m 57s) (24 80%) 1.5249
5m 1s (- 0m 46s) (25 83%) 1.4969
5m 13s (- 0m 34s) (26 86%) 1.4848
5m 24s (- 0m 23s) (27 90%) 1.4548
5m 36s (- 0m 11s) (28 93%) 1.4303
5m 47s (- 0m 0s) (29 96%) 1.4061


In [484]:
BATCH_SIZE = 1
def evaluateB(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        #input_tensor = tensorFromSentence(input_lang, sentence)
        input_tensor = torch.Tensor(sentence).long().to(device)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            #print(input_tensor[ei])
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]
       
        
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                #print(topi.item())
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [485]:
def evaluateRandomlyB(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(train_pairs)
        print('>', ' '.join([input_lang.index2word[x] for x in pair[0]]))
        print('=', ' '.join([output_lang.index2word[x] for x in pair[1]]))
        output_words = evaluateB(encoderB, decoderB, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [487]:
evaluateRandomlyB(encoder4, decoder4)

> он боится делать ошибки . EOS EOS EOS EOS EOS
= he s afraid of making mistakes . EOS EOS EOS
< he is afraid to be a . . <EOS>

> я не уверена . EOS EOS EOS EOS EOS EOS
= i am not sure . EOS EOS EOS EOS EOS
< i m not sure . <EOS>

> нам будет вас не хватать . EOS EOS EOS EOS
= we re going to miss you . EOS EOS EOS
< we re going to miss you . <EOS>

> я стоматолог . EOS EOS EOS EOS EOS EOS EOS
= i m a dentist . EOS EOS EOS EOS EOS
< i m a . <EOS>

> он всегда опаздывает в школу . EOS EOS EOS EOS
= he is always late for school . EOS EOS EOS
< he is always in in the . <EOS>

> вы не много теряете . EOS EOS EOS EOS EOS
= you re not missing much . EOS EOS EOS EOS
< you re not the wrong . <EOS>

> я вправе иметь собственное мнение . EOS EOS EOS EOS
= i m entitled to my own opinion . EOS EOS
< i m a to to to the . . <EOS>

> я благоразумен . EOS EOS EOS EOS EOS EOS EOS
= i m prudent . EOS EOS EOS EOS EOS EOS
< i m a . <EOS>

> он здесь для того, чтобы защитить тебя . EOS EOS
= he s here to p