In [1]:
%matplotlib inline

from io import open
import unicodedata
import string
import re
import random

import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
!wget https://www.manythings.org/anki/rus-eng.zip
!unzip rus-eng.zip

--2023-09-23 18:19:46--  https://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15824155 (15M) [application/zip]
Saving to: ‘rus-eng.zip’


2023-09-23 18:19:47 (19.8 MB/s) - ‘rus-eng.zip’ saved [15824155/15824155]

Archive:  rus-eng.zip
  inflating: rus.txt                 
  inflating: _about.txt              


In [3]:
!head rus.txt

Go.	Марш!	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1159202 (shanghainese)
Go.	Иди.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898247 (marafon)
Go.	Идите.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #5898250 (marafon)
Hi.	Здравствуйте.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #402127 (odexed)
Hi.	Привет!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #466968 (katjka)
Hi.	Хай.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #467233 (timsa)
Hi.	Здрасте.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #3803577 (marafon)
Hi.	Здоро́во!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #3854188 (marafon)
Hi.	Приветик!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #7234283 (marafon)
Run!	Беги!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #1569978 (Biga)


In [4]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z-А-аЯ-я.!?]+", r" ", s)
    return s

In [6]:
def readLangs(lang1 = 'eng', lang2 = 'rus', reverse=False):
    print("Reading lines...")
    
    # Read the file and split into lines
    lines = open('rus.txt', encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')[:2]] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 479223 sentence pairs
Trimmed to 27844 sentence pairs
Counting words...
Counted words:
rus 10125
eng 4320
['я собираюсь купить новую машину .', 'i am going to buy a new car .']


The Encoder
-----------





In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Decoder
-----------




In [10]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [12]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_losses.append(print_loss_avg)
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
    return print_losses

In [15]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [17]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [18]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

GRU = trainIters(encoder1, decoder1, 75000, print_every=5000)

1m 8s (- 16m 5s) (5000 6%) 3.1146
2m 8s (- 13m 52s) (10000 13%) 2.6256
3m 6s (- 12m 25s) (15000 20%) 2.3421
4m 6s (- 11m 17s) (20000 26%) 2.1445
5m 6s (- 10m 13s) (25000 33%) 2.0145
6m 7s (- 9m 10s) (30000 40%) 1.8395
7m 7s (- 8m 8s) (35000 46%) 1.7564
8m 7s (- 7m 6s) (40000 53%) 1.6396
9m 7s (- 6m 4s) (45000 60%) 1.5584
10m 7s (- 5m 3s) (50000 66%) 1.4903
11m 7s (- 4m 2s) (55000 73%) 1.4264
12m 7s (- 3m 1s) (60000 80%) 1.3373
13m 7s (- 2m 1s) (65000 86%) 1.2824
14m 7s (- 1m 0s) (70000 93%) 1.2237
15m 8s (- 0m 0s) (75000 100%) 1.1650


In [19]:
evaluateRandomly(encoder1, decoder1)

> вы здесь в безопасности .
= you re safe here .
< you re safe here danger . <EOS>

> я уверен что вы заслуживаете лучшего .
= i m sure you deserve better .
< i m sure you re found . . <EOS>

> я удивлена что тома здесь нет .
= i m surprised tom isn t here .
< i m surprised tom isn t here here . <EOS>

> тебе рано идти в армию .
= you re too young to join the army .
< you re too young to join the army . <EOS>

> я сеичас ем .
= i m eating now .
< i m eating now . <EOS>

> она в туалете .
= she s in the bathroom .
< she is in the . . <EOS>

> я сделаю тебе укол .
= i m going to give you an injection .
< i m going to get you a . . <EOS>

> ты чуть повыше меня .
= you re a little taller than i am .
< you re a little taller than i am . <EOS>

> я безжалостныи .
= i m ruthless .
< i m sure . <EOS>

> он такого же роста как мои отец .
= he is as tall as my father .
< he s as tall as my father . <EOS>



In [20]:
res = pd.DataFrame()
res['1_GRU'] = GRU
res

Unnamed: 0,1_GRU
0,3.114554
1,2.625627
2,2.342106
3,2.144511
4,2.014533
5,1.839546
6,1.75639
7,1.639556
8,1.558369
9,1.490343


### Попробуем 2 рекуррентных слоя вместо 1

In [47]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,num_layers=2)
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

In [48]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

In [49]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

GRU_2 = trainIters(encoder1, decoder1, 75000, print_every=5000)

1m 14s (- 17m 25s) (5000 6%) 3.1487
2m 23s (- 15m 33s) (10000 13%) 2.6643
3m 34s (- 14m 17s) (15000 20%) 2.4316
4m 43s (- 13m 0s) (20000 26%) 2.2138
5m 52s (- 11m 45s) (25000 33%) 2.0863
7m 1s (- 10m 31s) (30000 40%) 1.9435
8m 9s (- 9m 18s) (35000 46%) 1.8108
9m 17s (- 8m 7s) (40000 53%) 1.7151
10m 26s (- 6m 57s) (45000 60%) 1.6104
11m 37s (- 5m 48s) (50000 66%) 1.5421
12m 47s (- 4m 39s) (55000 73%) 1.4435
13m 57s (- 3m 29s) (60000 80%) 1.3908
15m 7s (- 2m 19s) (65000 86%) 1.2992
16m 18s (- 1m 9s) (70000 93%) 1.2270
17m 28s (- 0m 0s) (75000 100%) 1.1931


In [50]:
evaluateRandomly(encoder1, decoder1)

> он американец .
= he is american .
< he is going to <EOS>

> вы ужасны .
= you re terrible .
< you re getting . <EOS>

> ты докапываешься до мелочеи .
= you re splitting hairs .
< you re going to the . . <EOS>

> он младше меня .
= he s younger than me .
< he is younger than me . <EOS>

> ты ведь дочь тома ?
= you re tom s daughter aren t you ?
< you re tom s daughter aren t you ? <EOS>

> он пацифист .
= he s a pacifist .
< he s a . . <EOS>

> прости если задел твои чувства .
= i m sorry if i hurt your feelings .
< i m sorry if i your your your . <EOS>

> вы белая как простыня .
= you re white as a sheet .
< you re as white a sheet a sheet . <EOS>

> нам так весело .
= we re having so much fun .
< we re so kind . <EOS>

> она хорошо целуется .
= she s a good kisser .
< she s well . <EOS>



In [51]:
res['2_GRU'] = GRU_2
res

Unnamed: 0,1_GRU,2_GRU
0,3.114554,3.148655
1,2.625627,2.664298
2,2.342106,2.431606
3,2.144511,2.213844
4,2.014533,2.086283
5,1.839546,1.943527
6,1.75639,1.810832
7,1.639556,1.715083
8,1.558369,1.61035
9,1.490343,1.542077


### Попробуем GRU слой заменить на LSTM

In [83]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.LSTM(output, hidden)
        return output, hidden

    def initHidden(self):
        h = torch.zeros(1, 1, self.hidden_size, device=device)
        c = torch.zeros(1, 1, self.hidden_size, device=device)
        return h,c

In [84]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.LSTM(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        h = torch.zeros(1, 1, self.hidden_size, device=device)
        c = torch.zeros(1, 1, self.hidden_size, device=device)
        return h,c

In [85]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

LSTM_2 = trainIters(encoder1, decoder1, 75000, print_every=5000)

1m 6s (- 15m 37s) (5000 6%) 3.2031
2m 9s (- 14m 1s) (10000 13%) 2.7595
3m 12s (- 12m 49s) (15000 20%) 2.5654
4m 15s (- 11m 43s) (20000 26%) 2.3922
5m 18s (- 10m 36s) (25000 33%) 2.2260
6m 21s (- 9m 32s) (30000 40%) 2.1183
7m 25s (- 8m 28s) (35000 46%) 1.9887
8m 28s (- 7m 25s) (40000 53%) 1.9049
9m 32s (- 6m 21s) (45000 60%) 1.7934
10m 35s (- 5m 17s) (50000 66%) 1.7471
11m 41s (- 4m 14s) (55000 73%) 1.6513
12m 45s (- 3m 11s) (60000 80%) 1.5865
13m 49s (- 2m 7s) (65000 86%) 1.5224
14m 53s (- 1m 3s) (70000 93%) 1.4529
15m 56s (- 0m 0s) (75000 100%) 1.4006


In [86]:
evaluateRandomly(encoder1, decoder1)

> она очень быстрая .
= she is very fast .
< she is very fast . <EOS>

> вы в два раза старше меня .
= you re twice as old as i am .
< you re twice as old as i am . <EOS>

> он худощавыи .
= he s skinny .
< he is a . . <EOS>

> я рад что люди ее видели .
= i m glad people saw it .
< i m glad it saw it . <EOS>

> вы настоящии джентльмен .
= you re a true gentleman .
< you re a real girl . <EOS>

> мне стыдно за мое прошлое .
= i m ashamed of my past .
< i m ashamed of my own . <EOS>

> у нее плохое настроение .
= she is in a bad mood .
< she is in a bad mood . <EOS>

> они довольно новые .
= they re pretty new .
< they re pretty pretty . <EOS>

> она укладывает детеи .
= she s putting the children to bed .
< she is a young . <EOS>

> я начну сначала .
= i m going to start over .
< i m going to take a . . <EOS>



In [87]:
res['LSTM'] = LSTM_2
res

Unnamed: 0,1_GRU,2_GRU,LSTM
0,3.114554,3.148655,3.203134
1,2.625627,2.664298,2.759481
2,2.342106,2.431606,2.565408
3,2.144511,2.213844,2.392207
4,2.014533,2.086283,2.226007
5,1.839546,1.943527,2.118267
6,1.75639,1.810832,1.988666
7,1.639556,1.715083,1.904871
8,1.558369,1.61035,1.793425
9,1.490343,1.542077,1.747118
