# **Sequence to sequence**

In [1]:
import random
import sys

import numpy as np
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
MAX_SENTENCE_LENGTH = 10

In [4]:
class Dictionary:
    def __init__(self, dict_name):
        self.dict_name = dict_name
        self.word_2_idx = {
            "<sos>": 0,
            "<eos>": 1,
            "<unk>": 2
        }
        self.idx_2_word = {
            0: "<sos>",
            1: "<eos>",
            2: "<unk>"
        }
        self.word_2_count = {
            "<sos>": 1,
            "<eos>": 1,
            "<unk>": 1
        }
        self.n_words = 3

    def add_word(self, word):
        if word not in self.word_2_idx:
            self.word_2_idx[word] = self.n_words
            self.idx_2_word[self.n_words] = word
            self.n_words += 1
            self.word_2_count[word] = 1
        else:
            self.word_2_count[word] += 1

    def add_sentence(self, sentence):
        words = sentence.split()

        for word in words:
            self.add_word(word)

    def get_len(self):
        return self.n_words

In [5]:
class Corpus():
    def __init__(self, src_dict, target_dict):
        self.src_dict = Dictionary(src_dict)
        self.target_dict = Dictionary(target_dict)

    def get_data(self, path):
        pairs = []
    
        with open(path, "r") as file:
            lines = file.read().strip().split("\n")

            for line in lines:
                src, target = line.split("; ")

                src_len = len(src.split())
                target_len = len(target.split())

                if src_len <= MAX_SENTENCE_LENGTH and target_len <= MAX_SENTENCE_LENGTH:
                    self.src_dict.add_sentence(src)
                    self.target_dict.add_sentence(target)

                    src = " ".join([
                        word
                        if self.src_dict.word_2_count[word] > 1 else "<unk>"
                        for word in src.split()
                    ])
                    
                    target = " ".join([
                        word
                        if self.target_dict.word_2_count[word] > 1 else "<unk>"
                        for word in target.split()
                    ])

                    pairs.append([src, target])

        return pairs, self.src_dict, self.target_dict

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, emb_size, dropout_p):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.emb_size = emb_size

        self.embedding = nn.Embedding(self.input_size, self.emb_size)
        self.dropout = nn.Dropout(dropout_p)
        self.GRU = nn.GRU(self.emb_size, self.hidden_size)

    def forward(self, input, hidden):
        # emb_input size = [1, 1, self.emb_size]
        emb_input = self.dropout(self.embedding(input)).view(1, 1, -1)

        # out size = [1, 1, self.hidden_size], hidden size = [1, 1, self.hidden_size]
        out, hidden = self.GRU(emb_input, hidden)
        
        return out, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size).to(device)

In [7]:
class Decoder(nn.Module):
    def __init__(self, out_size, hidden_size, emb_size, dropout_p):
        super(Decoder, self).__init__()

        self.out_size = out_size
        self.hidden_size = hidden_size
        self.emb_size = emb_size

        self.embedding = nn.Embedding(self.out_size, self.emb_size)
        self.dropout = nn.Dropout(dropout_p)
        self.GRU = nn.GRU(self.emb_size, self.hidden_size)
        self.fc_1 = nn.Linear(self.hidden_size, self.out_size)

    def forward(self, input, hidden):
        # emb_input size = [1, 1, self.emb_size]
        emb_input = self.dropout(self.embedding(input)).view(1, 1, -1)

        # out size = [1, 1, self.hidden_size], hidden size = [1, 1, self.hidden_size]
        out, hidden = self.GRU(emb_input, hidden)

        # prediction size = [1, 1, self.out_size]
        prediction = torch.log_softmax(self.fc_1(out), dim=2)

        return prediction, hidden
        
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size).to(device)

In [8]:
class Seq_2_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_2_Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, target, teacher_forcing_ratio=0.5):
        src_length = src.size(0)
        target_length = target.size(0)

        encoder_hidden = encoder.init_hidden()

        for ei in range(src_length):
            encoder_output, encoder_hidden = encoder(src[ei], encoder_hidden)

        decoder_input = target[0]
        decoder_hidden = encoder_hidden

        decoder_outputs = torch.zeros(target_length - 1, 1, self.decoder.out_size).to(device)

        for di in range(target_length - 1):
            decoder_out, decoder_hidden = self.decoder.forward(decoder_input, decoder_hidden)
            
            decoder_outputs[di] = decoder_out

            use_teacher_forcing = random.random() < teacher_forcing_ratio

            if use_teacher_forcing:
                decoder_input = target[di + 1]
            else:
                prediction = torch.tensor([decoder_out.argmax()]).to(device)
                decoder_input = prediction

        return decoder_outputs.view(target_length - 1, -1)

In [9]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.xavier_uniform_(param.data.unsqueeze_(0))

In [10]:
def pair_2_tensor(pair, src_dict, target_dict):
    src = pair[0]
    src_indexes = [src_dict.word_2_idx[word] for word in src.split()]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).view(-1, 1)

    target = pair[1]
    target_indexes = [target_dict.word_2_idx[word] for word in target.split()]
    target_tensor = torch.tensor(target_indexes, dtype=torch.long, device=device).view(-1, 1)

    return src_tensor, target_tensor

In [11]:
epochs = 7
emb_size = 256
hidden_size = 512
dropout_p = 0.5
learning_rate = 7e-5

In [12]:
corpus = Corpus("en", "de")
pairs, src_dict, target_dict = corpus.get_data(
    "/content/sample_data/text_dataset_for_seq_2_seq.txt"
)

In [13]:
pairs_count = len(pairs)
src_dict_size = src_dict.n_words
target_dict_size = target_dict.n_words

In [14]:
encoder = Encoder(src_dict_size, hidden_size, emb_size, dropout_p).to(device)

In [15]:
decoder = Decoder(target_dict_size, hidden_size, emb_size, dropout_p).to(device)

In [16]:
seq_2_seq = Seq_2_Seq(encoder, decoder).to(device)
seq_2_seq.apply(init_weights)

Seq_2_Seq(
  (encoder): Encoder(
    (embedding): Embedding(2972, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (GRU): GRU(256, 512)
  )
  (decoder): Decoder(
    (embedding): Embedding(3799, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (GRU): GRU(256, 512)
    (fc_1): Linear(in_features=512, out_features=3799, bias=True)
  )
)

In [17]:
optimizer = optim.Adam(seq_2_seq.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [18]:
for epoch in range(epochs):
    epoch_loss = 0

    seq_2_seq.train()

    for pair in tqdm.tqdm(pairs, file=sys.stdout):
        # src_size = [src_len, 1], target_size = [target_len, 1]
        src, target = pair_2_tensor(pair, src_dict, target_dict)
        
        optimizer.zero_grad()

        out = seq_2_seq(src, target)
        loss = criterion(out, target[1:].view(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(seq_2_seq.parameters(), 1)
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch}; loss: {epoch_loss / pairs_count}\n")

100%|██████████| 4003/4003 [01:07<00:00, 59.58it/s]
Epoch 0; loss: 0.9961017623387373

100%|██████████| 4003/4003 [01:07<00:00, 59.65it/s]
Epoch 1; loss: 0.7135505962004839

100%|██████████| 4003/4003 [01:07<00:00, 59.26it/s]
Epoch 2; loss: 0.5835310007090597

100%|██████████| 4003/4003 [01:06<00:00, 60.49it/s]
Epoch 3; loss: 0.49330301802670234

100%|██████████| 4003/4003 [01:07<00:00, 59.64it/s]
Epoch 4; loss: 0.4533368889526269

100%|██████████| 4003/4003 [01:06<00:00, 60.10it/s]
Epoch 5; loss: 0.37534164499790146

100%|██████████| 4003/4003 [01:06<00:00, 60.39it/s]
Epoch 6; loss: 0.34541120628843536



In [19]:
with torch.no_grad():
    seq_2_seq.eval()

    random_pair = random.choice(pairs)

    src, target = pair_2_tensor(random_pair, src_dict, target_dict)

    # teacher_forcing_ratio = 0 when "eval" is on!
    out = seq_2_seq.forward(src, target, teacher_forcing_ratio=0)
    
    predict = " ".join([
        target_dict.idx_2_word[idx.item()]
        for idx in out.argmax(dim=1)
    ])

    print(f"source:  ", random_pair[0])
    print(f"target:  ", random_pair[1])
    print(f"predict: ", "<sos> " + predict, "\n")

source:   <sos> a dog is running through tall grass <eos>
target:   <sos> ein hund rennt durch hohes gras <eos>
predict:  <sos> ein hund rennt durch das rasen <eos> 

