# Tutorial: NLP From Scratch: Translation with a Sequence to Sequence Network and Attention


* [Tutorial Link](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)
* [Data Link](https://download.pytorch.org/tutorial/data.zip)

In [1]:
from io import open
import unicodedata
import string
import re
import random
from IPython import display as disp
import itertools

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
# Device:
# Use the GPU when CUDA is available, otherwise fall back to the CPU. The
# model classes and tensor helpers below read this module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show which device was selected.
print(device)

cuda


## Print / Trace Helpers 

In [3]:
def printt(obj, text, info_only=False):
    """Trace helper: print shape (and optionally contents) of tensor data.

    Nothing is printed unless the module-level flag `TRACE_ON` is truthy.

    Args:
        obj: A `torch.Tensor`, or a tuple/list whose items expose `.shape`.
            Any other object only produces the trailing blank line.
        text: Label printed in front of the shape information.
        info_only: When true, print only the label and shape, not the values.
    """
    if not TRACE_ON:
        return

    if isinstance(obj, torch.Tensor):
        print(">> {}\nShape: {}".format(text, obj.shape))
        if not info_only:
            print(obj)

    elif isinstance(obj, (tuple, list)):
        # One labelled entry per item, each followed by a blank separator line.
        for position, item in enumerate(obj):
            print(">> {} <{}-th>\nShape: {}".format(text, position, item.shape))
            if not info_only:
                print(item)
            print()

    # Blank line that terminates every trace record.
    print()
            

def printv(var, text):
    """Trace helper: print a labelled variable followed by a blank line.

    Silent unless the module-level flag `TRACE_ON` is truthy.
    """
    if not TRACE_ON:
        return
    print(">> {}:\n{}".format(text, var))
    print()

        
def printx(text):
    """Trace helper: print `text` followed by a blank line.

    Silent unless the module-level flag `TRACE_ON` is truthy.
    """
    if TRACE_ON:
        # The doubled newline reproduces "text line + empty line".
        print(text, end="\n\n")

## Data Processing

In [4]:
# Directory holding the translation text files read by `readLangs`. The file
# name is appended by plain string concatenation, so keep the trailing slash.
DATA_PATH = "./data/"

In [5]:
# Reserved vocabulary indices for the start-of-sentence / end-of-sentence markers.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Attributes:
        name: Name of the language (e.g. "eng").
        word2index: Maps each known word to its integer index.
        word2count: Maps each known word to how often it has been added.
        index2word: Inverse of `word2index`, pre-populated with the SOS/EOS
            markers.
        n_words: Number of entries in `index2word` (SOS and EOS included).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Use the module constants rather than literal 0/1 so the reserved
        # indices are defined in exactly one place.
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = len(self.index2word)  # Count SOS and EOS

    def addSentence(self, sentence):
        """Add every space-separated word of `sentence` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register `word`: assign the next free index or bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __repr__(self):
        """Return a multi-line summary with a short preview of each mapping."""
        n_examples = 10

        def preview(mapping):
            # First `n_examples` items, in insertion order.
            return tuple(itertools.islice(mapping.items(), n_examples))

        # The "[:N]" label is derived from `n_examples` so it always matches
        # the number of items actually shown.
        _repr = "name: {}\n".format(self.name)
        _repr += "n_words: {}\n".format(self.n_words)
        _repr += "word2count [:{}]: {}\n".format(n_examples, preview(self.word2count))
        _repr += "word2index [:{}]: {}\n".format(n_examples, preview(self.word2index))
        _repr += "index2word [:{}]: {}\n".format(n_examples, preview(self.index2word))
        return _repr

In [6]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks from `s` (e.g. turn an accented e into "e").

    The string is decomposed with NFD normalisation, then every character in
    the Unicode category "Mn" (non-spacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase `s`, drop accents, and reduce it to letters and `.!?`.

    Steps:
      1. lowercase, trim, and strip combining marks via `unicodeToAscii`;
      2. put a space in front of each `.`, `!` or `?` so punctuation becomes
         its own token;
      3. replace every run of other non-letter characters by a single space;
      4. trim again.

    The final trim matters: step 3 can leave a leading or trailing space
    (e.g. when the text starts or ends with digits), which would otherwise
    become an empty-string "word" after `split(' ')` and could stop a
    sentence from matching a prefix filter.

    Returns:
        The normalised string.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [7]:
def readLangs(lang1, lang2, reverse=False):
    """Read `<lang1>-<lang2>.txt` from `DATA_PATH` into normalised pairs.

    Each line of the file is split on tabs and every field is passed through
    `normalizeString`.

    Args:
        lang1: Name of the first language in the file name / first column.
        lang2: Name of the second language in the file name / second column.
        reverse: When true, reverse each pair and swap which language is the
            input and which is the output.

    Returns:
        `(input_lang, output_lang, pairs)`, where both `Lang` objects are
        still empty (no words counted yet) and `pairs` is a list of lists of
        normalised sentences.
    """
    print("Reading lines...")

    # Read the file and split into lines. The `with` block closes the file
    # handle as soon as its contents have been read.
    with open(DATA_PATH + '%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [8]:
# Keep only pairs in which both sentences are shorter than `MAX_LENGTH` words and the second (English) sentence begins with one of `eng_prefixes`.

# Sentences must have fewer than this many space-separated tokens.
MAX_LENGTH = 10

# Accepted openings of the second sentence of a pair (apostrophes have already
# been replaced by spaces during normalisation, hence "i m " etc.).
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)


def filterPair(p):
    """Return True when pair `p` is short enough and has an accepted prefix.

    Both `p[0]` and `p[1]` must split into fewer than `MAX_LENGTH` tokens,
    and `p[1]` must start with one of `eng_prefixes`.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return the pairs accepted by `filterPair`, in their original order."""
    return list(filter(filterPair, pairs))

In [9]:
# Data preparation script itself.

def prepareData(lang1, lang2, reverse=False):
    """Load, filter and index the sentence pairs for a language pair.

    Reads the raw pairs with `readLangs`, keeps those accepted by
    `filterPairs`, and feeds the surviving sentences into the two
    vocabularies. Progress is reported on stdout.

    Returns:
        `(input_lang, output_lang, pairs)` with populated vocabularies and
        the filtered list of pairs.
    """
    source_vocab, target_vocab, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for kept in kept_pairs:
        source_vocab.addSentence(kept[0])
        target_vocab.addSentence(kept[1])

    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)

    return source_vocab, target_vocab, kept_pairs


# The file is eng-fra.txt; reverse=True swaps each pair so that French is the
# input language and English the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', reverse=True)
# Show one random pair as a sanity check.
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['tu t exprimes bien .', 'you re articulate .']


In [10]:
# Preview the processed data.

# First ten filtered sentence pairs.
disp.display(pairs[:10])

# Summary (via Lang.__repr__) of each vocabulary, under a bracketed heading.
for _title, _vocab in (("input_lang", input_lang), ("output_lang", output_lang)):
    print("\n[{}]".format(_title))
    disp.display(_vocab)

[['j ai ans .', 'i m .'],
 ['je vais bien .', 'i m ok .'],
 ['ca va .', 'i m ok .'],
 ['je suis gras .', 'i m fat .'],
 ['je suis gros .', 'i m fat .'],
 ['je suis en forme .', 'i m fit .'],
 ['je suis touche !', 'i m hit !'],
 ['je suis touchee !', 'i m hit !'],
 ['je suis malade .', 'i m ill .'],
 ['je suis triste .', 'i m sad .']]


[input_lang]


name: fra
n_words: 4345
word2count [:5]: (('j', 414), ('ai', 340), ('ans', 55), ('.', 10262), ('je', 3654), ('vais', 245), ('bien', 104), ('ca', 104), ('va', 51), ('suis', 2544))
word2index [:5]: (('j', 2), ('ai', 3), ('ans', 4), ('.', 5), ('je', 6), ('vais', 7), ('bien', 8), ('ca', 9), ('va', 10), ('suis', 11))
index2word [:5]: ((0, 'SOS'), (1, 'EOS'), (2, 'j'), (3, 'ai'), (4, 'ans'), (5, '.'), (6, 'je'), (7, 'vais'), (8, 'bien'), (9, 'ca'))


[output_lang]


name: eng
n_words: 2803
word2count [:5]: (('i', 4305), ('m', 3480), ('.', 10373), ('ok', 10), ('fat', 19), ('fit', 10), ('hit', 4), ('!', 82), ('ill', 7), ('sad', 14))
word2index [:5]: (('i', 2), ('m', 3), ('.', 4), ('ok', 5), ('fat', 6), ('fit', 7), ('hit', 8), ('!', 9), ('ill', 10), ('sad', 11))
index2word [:5]: ((0, 'SOS'), (1, 'EOS'), (2, 'i'), (3, 'm'), (4, '.'), (5, 'ok'), (6, 'fat'), (7, 'fit'), (8, 'hit'), (9, '!'))

## Seq2Seq Model 

In [11]:
# Encoder.

class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    Args:
        input_size: Size of the input vocabulary (number of embeddings).
        hidden_size: Dimension of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_, hidden):
        """Encode one token.

        Args:
            input_: Tensor holding a single token index.
            hidden: Previous GRU state, shape (1, 1, hidden_size).

        Returns:
            `(output, hidden)` as produced by the GRU for this single step.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
        embedded = self.embedding(input_).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial GRU state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        weights, so it always matches the GRU regardless of where the module
        has been moved.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [12]:
# Simple decoder.

class DecoderRNN(nn.Module):
    """Single-layer GRU decoder without attention; one token per call.

    Args:
        hidden_size: Dimension of both the embeddings and the GRU state.
        output_size: Size of the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_, hidden):
        """Decode one step.

        Args:
            input_: Tensor holding the previous token index.
            hidden: Previous GRU state, shape (1, 1, hidden_size).

        Returns:
            `(output, hidden)`: log-probabilities over the output vocabulary
            with shape (1, output_size), and the new GRU state.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
        output = self.embedding(input_).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the sequence dimension before the projection.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero GRU state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        weights, so it always matches the GRU regardless of where the module
        has been moved.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [13]:
# Attention decoder.

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder outputs.

    Args:
        hidden_size: Dimension of the embeddings and the GRU state.
        output_size: Size of the output vocabulary.
        dropout_p: Dropout probability applied to the input embedding.
        max_length: Number of encoder positions attended over; the
            `encoder_outputs` passed to `forward` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_, hidden, encoder_outputs):
        """Decode one step, attending over `encoder_outputs`.

        Args:
            input_: Tensor holding the previous token index.
            hidden: Previous GRU state, shape (1, 1, hidden_size).
            encoder_outputs: Encoder outputs, shape (max_length, hidden_size).

        Returns:
            `(output, hidden, attn_weights)`: log-probabilities over the
            output vocabulary with shape (1, output_size), the new GRU state,
            and the attention weights with shape (1, max_length).
        """
        embedded = self.embedding(input_).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights come from the current input embedding together
        # with the previous hidden state.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs:
        # (1, 1, max_length) x (1, max_length, hidden_size) -> (1, 1, hidden_size)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        # Merge the embedding with the attended context back to hidden_size.
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero GRU state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        weights, so it always matches the GRU regardless of where the module
        has been moved.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [14]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of `sentence` to its index in `lang`.

    Raises:
        KeyError: If a word is missing from `lang.word2index`.
    """
    lookup = lang.word2index
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lookup[word])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of word indices ending in EOS.

    Returns:
        A `torch.long` tensor of shape (number_of_words + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # One token per row.
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into `(input_tensor, target_tensor)`.

    `pair[0]` is encoded with the module-level `input_lang` vocabulary and
    `pair[1]` with `output_lang`.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

## Training

In [15]:
# Training function: ONE ITERATION.

# Probability of feeding the ground-truth token (rather than the decoder's own
# prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, debug_mode=False):
    """Run one optimisation step on a single (input, target) sentence pair.

    The input is fed through the encoder token by token; the decoder then
    starts from SOS and the final encoder state and produces one output per
    target position. With probability `teacher_forcing_ratio` the target
    tokens are fed as decoder inputs; otherwise the decoder's own top
    prediction is fed back and decoding stops early once it predicts EOS.

    Args:
        input_tensor: Column tensor of input token indices.
        target_tensor: Column tensor of target token indices.
        encoder, decoder: The modules to train. `AttnDecoderRNN` decoders
            additionally receive the encoder outputs.
        encoder_optimizer, decoder_optimizer: Optimisers for the two modules.
        criterion: Loss applied to each decoder output and its target token.
        max_length: Number of rows in the encoder output buffer.
        debug_mode: When true, teacher forcing is always disabled.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows beyond the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    printt(encoder_outputs, "encoder_outputs")

    def decode_step(step_input, step_hidden):
        # One decoder step; the per-step trace output is only produced for
        # the decoder without attention.
        if isinstance(decoder, AttnDecoderRNN):
            step_output, step_hidden, _attention = decoder(step_input, step_hidden, encoder_outputs)
        else:
            step_output, step_hidden = decoder(step_input, step_hidden)
            printt(step_output, "decoder_output", info_only=True)
            printt(step_hidden, "decoder_hidden", info_only=True)
        return step_output, step_hidden

    loss = 0

    printx("ENCODER LOOP ... ... ... ... ... ... ... ...\nfor ei in range(input_length):")
    for position in range(input_length):
        printv(position, "ei")
        encoder_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
        printt(encoder_hidden, "encoder_hidden", info_only=True)
        # TODO: print --> encoder_output
        encoder_outputs[position] = encoder_output[0, 0]
    printx("... ... ... ... ... ... ... ... ... ...")

    decoder_input = torch.tensor([[SOS_token]], device=device)
    printt(decoder_input, "decoder_input")

    # The decoder starts from the encoder's final state.
    decoder_hidden = encoder_hidden
    printx("decoder_hidden = encoder_hidden")
    printt(decoder_hidden, "decoder_hidden", info_only=True)

    # The random draw always happens; debug mode then overrides the outcome.
    use_teacher_forcing = (random.random() < teacher_forcing_ratio) and not debug_mode
    printv(use_teacher_forcing, "use_teacher_forcing")

    printx("DECODER LOOP ... ... ... ... ... ... ... ...\nfor di in range(target_length):")

    for step in range(target_length):
        printv(step, "di")

        decoder_output, decoder_hidden = decode_step(decoder_input, decoder_hidden)
        target = target_tensor[step]

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            printt(target, "target")

            loss += criterion(decoder_output, target)
            printv(loss.item(), "loss += criterion(decoder_output, target)")

            decoder_input = target
            printx("decoder_input = target_tensor[di]")
            printt(decoder_input, "decoder_input")
        else:
            # Without teacher forcing: use its own predictions as the next input
            topv, topi = decoder_output.topk(1)
            printx("topv, topi = decoder_output.topk(1)")
            printv(topv, "topv")
            printv(topi, "topi")

            decoder_input = topi.squeeze().detach()  # detach from history as input
            printx("decoder_input = topi.squeeze().detach()")
            printt(decoder_input, "decoder_input")

            printt(target, "target")

            loss += criterion(decoder_output, target)
            printv(loss.item(), "loss += criterion(decoder_output, target)")

            # Stop once the decoder itself predicts end-of-sentence.
            if decoder_input.item() == EOS_token:
                break

    printx("... ... ... ... ... ... ... ... ... ...")

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [16]:
# Timer helpers.

import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'.

    Fractional seconds are truncated by the `%d` conversion.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a task started at `since`.

    Args:
        since: Start time as returned by `time.time()`.
        percent: Fraction of the work completed so far; must be non-zero.

    The remaining time is extrapolated linearly: estimated total time is
    elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

The whole training process looks like this:
* Start a timer
* Initialize optimizers and criterion
* Create set of training pairs
* Start empty losses array for plotting

Then we call `train` many times and occasionally print the progress (% of examples, time so far, estimated time) and average loss

In [17]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, debug_mode=False):
    """Train on `n_iters` randomly sampled sentence pairs and report progress.

    One pair is drawn (with replacement) from the module-level `pairs` per
    iteration and passed to `train`. Every `print_every` iterations the
    elapsed/remaining time and the average loss since the last report are
    printed; every `plot_every` iterations the average loss is recorded, and
    the recorded values are plotted at the end with `showPlot`.

    Args:
        encoder, decoder: Modules to train (each gets its own SGD optimiser).
        n_iters: Number of training iterations.
        print_every: Reporting interval, in iterations.
        plot_every: Loss-recording interval, in iterations.
        learning_rate: Learning rate for both optimisers.
        debug_mode: When true, exactly one iteration is run, its loss is
            always printed, and no plot is drawn.
    """
    # If in debug mode, only one iteration will be run. This is decided before
    # sampling so that no tensors are built for pairs that are never used
    # (the first sampled pair is unaffected).
    if debug_mode:
        n_iters = 1

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every time progress is printed
    print_loss_count = 0  # Iterations accumulated in print_loss_total
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    picked_pairs = [random.choice(pairs) for _ in range(n_iters)]
    training_pairs = [tensorsFromPair(p) for p in picked_pairs]

    # Guarded so that n_iters == 0 does not fail on the [0] lookups.
    if training_pairs:
        printx("Example sentence data: ---")
        printv(picked_pairs[0], "picked_pairs[0]")
        printt(training_pairs[0], "training_pairs[0]")
        printx("--- --- --- --- --- --- ---")

    criterion = nn.NLLLoss()

    # `step` (not `iter`) so the builtin is not shadowed.
    for step in range(1, n_iters + 1):

        printv(step, "iter")

        training_pair = training_pairs[step - 1]
        printt(training_pair, "training_pair")

        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, debug_mode=debug_mode)
        print_loss_total += loss
        print_loss_count += 1
        plot_loss_total += loss

        if step % print_every == 0 or debug_mode:
            # Average over the iterations actually accumulated. This equals
            # print_every on regular reports, and 1 in debug mode, where
            # dividing by print_every would understate the loss.
            print_loss_avg = print_loss_total / print_loss_count
            printv(print_loss_avg, "print_loss_avg")

            print_loss_total = 0
            print_loss_count = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters), step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    if not debug_mode:
        showPlot(plot_losses)

## Evaluation 

In [18]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the given encoder/decoder.

    Both modules are switched to evaluation mode for the duration of the call
    (so that layers such as dropout are inactive and decoding is
    deterministic) and restored to their previous mode afterwards. Gradients
    are not tracked.

    Args:
        encoder, decoder: Trained modules. `AttnDecoderRNN` decoders
            additionally receive the encoder outputs and yield attention
            weights.
        sentence: Normalised input sentence; every word must be present in
            `input_lang`, and the sentence plus EOS must fit in `max_length`
            positions.
        max_length: Size of the encoder output buffer and maximum number of
            decoding steps.

    Returns:
        `(decoded_words, decoder_attentions)`: the predicted words (ending
        with '<EOS>' if the decoder emitted EOS within `max_length` steps)
        and one row of attention weights per decoding step (rows stay zero
        for decoders without attention).
    """
    # Remember the current modes so callers that keep training afterwards
    # are not silently left in eval mode.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():

            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            # The decoder starts from the encoder's final state.
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):

                if isinstance(decoder, AttnDecoderRNN):
                    decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
                    # No `.data` needed: nothing tracks gradients under no_grad.
                    decoder_attentions[di] = decoder_attention
                else:
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

                # Greedy choice: take the single most likely token.
                topv, topi = decoder_output.topk(1)

                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [19]:
def evaluateRandomly(encoder, decoder, n=10, seed=12345):
    """Translate `n` sampled pairs and print input, reference and prediction.

    For the i-th sample the global `random` generator is re-seeded with
    `seed + i` before a pair is drawn from the module-level `pairs`, so the
    selection is reproducible. Lines are prefixed with '>' (input),
    '=' (reference) and '<' (model output).
    """
    for offset in range(n):
        random.seed(seed + offset)
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        # The attention weights returned alongside the words are not needed here.
        output_words, _ = evaluate(encoder, decoder, pair[0])
        print('<', ' '.join(output_words))
        print('')

## Visualisation

In [20]:
import matplotlib.pyplot as plt
# plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis ticks every 0.2.

    Args:
        points: Sequence of y-values, plotted against their index.
    """
    # `plt.subplots()` already creates the figure; a preceding `plt.figure()`
    # would only leave a second, empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes that were just configured rather than on whatever
    # pyplot currently considers the active axes.
    ax.plot(points)

## Execute Training and Evaluation 

In [21]:
# Set reproducibility.
TRAIN_SEED = 42

# Make cuDNN pick deterministic algorithms and skip its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed Python's RNG (pair sampling, teacher-forcing draws) and PyTorch's RNG.
random.seed(TRAIN_SEED)
torch.manual_seed(TRAIN_SEED)

In [None]:
USE_ATTENTION = False  # True -> AttnDecoderRNN, False -> plain DecoderRNN
EPOCHS = 75000  # Passed to trainIters as n_iters. Tried: 75000, 10000
PRINT_EVERY = 500  # Progress reporting interval. Tried: 5000, 500

hidden_size = 256

# The encoder is built first, then the decoder.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = (
    AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)
    if USE_ATTENTION
    else DecoderRNN(hidden_size, output_lang.n_words)
).to(device)

# Print detailed debug info. The same flag also puts trainIters in debug mode
# (a single traced iteration).
TRACE_ON = False
trainIters(encoder1, decoder1, EPOCHS, print_every=PRINT_EVERY, debug_mode=TRACE_ON)

0m 14s (- 36m 19s) (500 0%) 3.5185
0m 22s (- 27m 33s) (1000 1%) 3.3353
0m 30s (- 24m 33s) (1500 2%) 3.0724
0m 37s (- 22m 59s) (2000 2%) 2.9439
0m 45s (- 22m 10s) (2500 3%) 2.9132
0m 53s (- 21m 31s) (3000 4%) 2.8543
1m 1s (- 21m 1s) (3500 4%) 2.7399
1m 9s (- 20m 36s) (4000 5%) 2.5961
1m 17s (- 20m 17s) (4500 6%) 2.6216
1m 25s (- 20m 1s) (5000 6%) 2.6076
1m 33s (- 19m 46s) (5500 7%) 2.5676
1m 42s (- 19m 33s) (6000 8%) 2.4761
1m 50s (- 19m 21s) (6500 8%) 2.4661
1m 58s (- 19m 10s) (7000 9%) 2.4499
2m 6s (- 18m 57s) (7500 10%) 2.3975
2m 14s (- 18m 46s) (8000 10%) 2.3476
2m 22s (- 18m 36s) (8500 11%) 2.2955
2m 30s (- 18m 25s) (9000 12%) 2.3179
2m 38s (- 18m 15s) (9500 12%) 2.2531
2m 46s (- 18m 4s) (10000 13%) 2.2226
2m 54s (- 17m 54s) (10500 14%) 2.1521
3m 2s (- 17m 44s) (11000 14%) 2.1316
3m 10s (- 17m 34s) (11500 15%) 2.0186
3m 19s (- 17m 25s) (12000 16%) 2.1367
3m 27s (- 17m 15s) (12500 16%) 1.9492
3m 35s (- 17m 6s) (13000 17%) 2.0758
3m 43s (- 16m 56s) (13500 18%) 1.9855
3m 51s (- 16m 47

In [None]:
# Base seed for evaluation; evaluateRandomly derives one seed per sample from it.
EVAL_SEED = 12345
# Reseed PyTorch's global RNG before evaluating.
torch.manual_seed(EVAL_SEED)

# Print input / reference / prediction for the sampled sentence pairs.
evaluateRandomly(encoder1, decoder1, seed=EVAL_SEED)

In [None]:
# TODO: Investigate BATCHING!