In this project we will be teaching a neural network to translate from French to English.

## Initialization

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re # regex
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device.type}")

device: cuda


## Loading Data File

In [4]:
SOS_token = 0
EOS_token = 1

In [6]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2 # count SOS and EOS
        
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
            
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

The files are all in Unicode, to simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation.

In [7]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [11]:
def readLangs(lang1="fra", lang2="eng", reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(f"data/{lang1}-{lang2}.txt", encoding='utf-8').read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1) # create Lang objects
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

Since there are a lot of example sentences and we want to train something quickly, we’ll trim the data set to only relatively short and simple sentences. Here the maximum length is 10 words (that includes ending punctuation) and we’re filtering to sentences that translate to the form “I am” or “He is” etc. (accounting for apostrophes replaced earlier).

In [10]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [12]:
def prepareData(lang1, lang2, reverse=False):
    
    # read source text and turn it into sentence pairs
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print(f"Read {len(pairs)} sentence pairs")
    
    # do filtering
    pairs = filterPairs(pairs)
    print(f"Trimmed to {len(pairs)} sentence pairs")
    
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0]) # map word to index
        output_lang.addSentence(pair[1])
        
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    return input_lang, output_lang, pairs

In [14]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803


In [17]:
print(random.choice(pairs))

['ils sont probablement etasuniens .', 'they re probably americans .']


## The Seq2Seq Model

In [18]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        
        gru_output, gru_hidden = self.gru(output, hidden)
        
        return gru_output, gru_hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = device)

In [19]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size) # output_size: number of fra words
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = F.relu(embedded) # why relu
        
        gru_output, gru_hidden = self.gru(embedded, hidden)
        output = self.out(gru_output[0])
        output = self.softmax(output)
        
        return output, gru_hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = device)

## Prepare Training Data

In [20]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorFromPair(pair): # pair: [eng sentence, fra sentence]
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

## Training the Model

To train we run the input sentence through the encoder, and keep track of every output and the latest hidden state. Then the decoder is given the `<SOS>` token as its first input, and the last hidden state of the encoder as its first hidden state.

“Teacher forcing” is the concept of using the real target outputs as each next input, instead of using the decoder’s guess as the next input. Using teacher forcing causes it to converge faster but when the trained network is exploited, it may exhibit instability.

You can observe outputs of teacher-forced networks that read with coherent grammar but wander far from the correct translation - intuitively it has learned to represent the output grammar and can “pick up” the meaning once the teacher tells it the first few words, but it has not properly learned how to create the sentence from the translation in the first place.

Because of the freedom PyTorch’s autograd gives us, we can randomly choose to use teacher forcing or not with a simple if statement. Turn `teacher_forcing_ratio` up to use more of it.

In [22]:
teacher_forcing_ratio = 0.5

In [48]:
# traing an example(pair)
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = MAX_LENGTH):
    
    # init first hidden state of encoder GRU
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    # create tensor to store output before loop
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device = device)
    loss = 0
    
    for e_idx in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[e_idx], encoder_hidden)
        encoder_outputs[e_idx] = encoder_output[0, 0]
        
    # first input for decoder is SOS indicating start of sentence
    decoder_input = torch.tensor([[SOS_token]], device = device)
    
    # CORE! hidden state of encoder will be passed to decoder!
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # Teacher Forcing: Feed the target as the next input
        for d_idx in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            loss += criterion(decoder_output, target_tensor[d_idx])
            
            decoder_input = target_tensor[d_idx] # teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for d_idx in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            # select word with highest possibility 
            top_v, top_i = decoder_output.topk(1)
            
            decoder_input = top_i.squeeze().detach() # detach from history as input
            
            loss += criterion(decoder_output, target_tensor[d_idx])
            if decoder_input.item() == EOS_token:
                break
                
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item() / target_length # average loss

This is a helper function to print time elapsed and estimated time remaining given the current time and progress %.

In [24]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [61]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    
    start = time.time()
    
    print_loss_total = 0 # reset every <print_every>
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    training_pairs = [tensorFromPair(random.choice(pairs)) for i in range(n_iters)]
    
    # loss function
    criterion = nn.NLLLoss() # The negative log likelihood loss. It is useful to train a classification problem with C classes.
    
    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        print_loss_total += loss
        
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0 # reset
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

In [44]:
HIDDEN_SIZE = 256

In [59]:
encoder = EncoderRNN(input_lang.n_words, HIDDEN_SIZE).to(device)
decoder = DecoderRNN(HIDDEN_SIZE, output_lang.n_words).to(device)

trainIters(encoder, decoder, 75000, print_every=5000)

0m 43s (- 10m 5s) (5000 6%) 2.9147
1m 23s (- 9m 1s) (10000 13%) 2.3748
2m 3s (- 8m 14s) (15000 20%) 2.0523
2m 43s (- 7m 29s) (20000 26%) 1.7770
3m 23s (- 6m 47s) (25000 33%) 1.6004
4m 4s (- 6m 6s) (30000 40%) 1.3791
4m 44s (- 5m 24s) (35000 46%) 1.2256
5m 24s (- 4m 43s) (40000 53%) 1.1098
6m 5s (- 4m 3s) (45000 60%) 0.9969
6m 45s (- 3m 22s) (50000 66%) 0.9080
7m 26s (- 2m 42s) (55000 73%) 0.7830
8m 7s (- 2m 1s) (60000 80%) 0.7233
8m 47s (- 1m 21s) (65000 86%) 0.6479
9m 28s (- 0m 40s) (70000 93%) 0.5743
10m 10s (- 0m 0s) (75000 100%) 0.5240


## Random Evaluation 

In [54]:
# evaluate() is similar to train() but no optimize & teacher forcing
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    
    with torch.no_grad(): # Context-manager that disabled gradient calculation
        
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device = device)
        
        for e_idx in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[e_idx], encoder_hidden)
            encoder_outputs[e_idx] += encoder_output[0, 0]
            
        decoder_input = torch.tensor([[SOS_token]], device=device) # decoder input starts with SOS
        
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        
        # no teacher-forcing: use previous output as next cell's input
        for d_idx in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            top_v, top_i = decoder_output.data.topk(1)
            if top_i.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[top_i.item()])
                
            decoder_input = top_i.squeeze().detach() # make it to be input
            
        return decoded_words

In [57]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [60]:
evaluateRandomly(encoder, decoder)

> je ne suis pas petite .
= i m not short .
< i m not persuaded . <EOS>

> il est heroino dependant .
= he is a heroin addict .
< he s addicted to heroin . <EOS>

> elle est puissante .
= she is powerful .
< she is powerful . <EOS>

> vous etes en securite .
= you re safe .
< you re safe . <EOS>

> j ai peur des araignees .
= i m afraid of spiders .
< i m afraid of dogs . <EOS>

> il est dramaturge .
= he is a dramatist .
< he is a dj . <EOS>

> vous n etes pas ainsi d ordinaire .
= you re not usually like this .
< you re not usually like this . <EOS>

> je suis assez fatiguee .
= i m quite tired .
< i m quite tired . <EOS>

> ce sont des affaires dont nous devons parler .
= they are matters which we need to discuss .
< they are talking to the the game . <EOS>

> aujourd hui je ne patinerai pas .
= i m not going to skate today .
< i m not going to get leaving . <EOS>

